From 2ed2fd6f4f1c65141f08de02f3680dfb22e55f58 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 2 Apr 2024 17:28:07 -0700 Subject: [PATCH 01/80] fix cosmetic issues --- .../sampling_post_processing_impl.cuh | 135 +++++++++--------- .../sampling/sampling_post_processing_test.cu | 20 +-- 2 files changed, 70 insertions(+), 85 deletions(-) diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index 299aae13718..d8e081060d4 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -218,7 +218,7 @@ void check_input_edges( std::get<0>(*edgelist_label_offsets).data() + std::get<1>(*edgelist_label_offsets), size_t{1}, handle.get_stream()); - handle.get_stream(); + handle.sync_stream(); CUGRAPH_EXPECTS( back_element == edgelist_srcs.size(), "Invalid input arguments: if edgelist_label_offsets is valid, the last element of " @@ -258,6 +258,7 @@ compute_min_hop_for_unique_label_vertex_pairs( std::optional> tmp_hops{std::nullopt}; if (hops) { + // FIXME: why not use cub::DeviceSegmentedSort::SortPairs??? tmp_vertices.resize(vertices.size(), handle.get_stream()); thrust::copy( handle.get_thrust_policy(), vertices.begin(), vertices.end(), tmp_vertices.begin()); @@ -1617,47 +1618,47 @@ renumber_and_sort_sampled_edgelist( (*edgelist_label_hop_offsets).begin(), (*edgelist_label_hop_offsets).end(), size_t{0}); - // FIXME: the device lambda should be placed in cuda::proclaim_return_type() - // once we update CCCL version to 2.x thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(num_labels * num_hops), (*edgelist_label_hop_offsets).begin(), - [edgelist_label_offsets = edgelist_label_offsets - ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) - : thrust::nullopt, - edgelist_hops = edgelist_hops - ? thrust::make_optional>( + cuda::proclaim_return_type( + [edgelist_label_offsets = edgelist_label_offsets + ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops = + edgelist_hops ? 
thrust::make_optional>( std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) - : thrust::nullopt, - num_hops, - num_edges = edgelist_majors.size()] __device__(size_t i) { - size_t start_offset{0}; - auto end_offset = num_edges; - - if (edgelist_label_offsets) { - auto l_idx = static_cast(i / num_hops); - start_offset = (*edgelist_label_offsets)[l_idx]; - end_offset = (*edgelist_label_offsets)[l_idx + 1]; - } + : thrust::nullopt, + num_hops, + num_edges = edgelist_majors.size()] __device__(size_t i) { + size_t start_offset{0}; + auto end_offset = num_edges; + + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / num_hops); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + } - if (edgelist_hops) { - auto h = static_cast(i % num_hops); - auto lower_it = thrust::lower_bound(thrust::seq, - (*edgelist_hops).begin() + start_offset, - (*edgelist_hops).begin() + end_offset, - h); - auto upper_it = thrust::upper_bound(thrust::seq, - (*edgelist_hops).begin() + start_offset, - (*edgelist_hops).begin() + end_offset, - h); - start_offset = static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); - end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); - } + if (edgelist_hops) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = + static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } - return end_offset - start_offset; - }); + return end_offset - start_offset; + })); thrust::exclusive_scan(handle.get_thrust_policy(), (*edgelist_label_hop_offsets).begin(), (*edgelist_label_hop_offsets).end(), @@ -1744,47 +1745,47 @@ sort_sampled_edgelist( (*edgelist_label_hop_offsets).begin(), (*edgelist_label_hop_offsets).end(), size_t{0}); - // FIXME: the device lambda should be placed in cuda::proclaim_return_type() - // once we update CCCL version to 2.x thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(num_labels * num_hops), (*edgelist_label_hop_offsets).begin(), - [edgelist_label_offsets = edgelist_label_offsets - ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) - : thrust::nullopt, - edgelist_hops = edgelist_hops - ? thrust::make_optional>( + cuda::proclaim_return_type( + [edgelist_label_offsets = edgelist_label_offsets + ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops = + edgelist_hops ? 
thrust::make_optional>( std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) - : thrust::nullopt, - num_hops, - num_edges = edgelist_majors.size()] __device__(size_t i) { - size_t start_offset{0}; - auto end_offset = num_edges; - - if (edgelist_label_offsets) { - auto l_idx = static_cast(i / num_hops); - start_offset = (*edgelist_label_offsets)[l_idx]; - end_offset = (*edgelist_label_offsets)[l_idx + 1]; - } + : thrust::nullopt, + num_hops, + num_edges = edgelist_majors.size()] __device__(size_t i) { + size_t start_offset{0}; + auto end_offset = num_edges; + + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / num_hops); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + } - if (edgelist_hops) { - auto h = static_cast(i % num_hops); - auto lower_it = thrust::lower_bound(thrust::seq, - (*edgelist_hops).begin() + start_offset, - (*edgelist_hops).begin() + end_offset, - h); - auto upper_it = thrust::upper_bound(thrust::seq, - (*edgelist_hops).begin() + start_offset, - (*edgelist_hops).begin() + end_offset, - h); - start_offset = static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); - end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); - } + if (edgelist_hops) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = + static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } - return end_offset - start_offset; - }); + return end_offset - start_offset; + })); thrust::exclusive_scan(handle.get_thrust_policy(), (*edgelist_label_hop_offsets).begin(), (*edgelist_label_hop_offsets).end(), diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu index c93994ddfad..ce45ea08162 100644 --- a/cpp/tests/sampling/sampling_post_processing_test.cu +++ b/cpp/tests/sampling/sampling_post_processing_test.cu @@ -581,14 +581,6 @@ class Tests_SamplingPostProcessing std::optional> renumbered_and_sorted_renumber_map_label_offsets{ std::nullopt}; - { - size_t free_size{}; - size_t total_size{}; - RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); - std::cout << "free_size=" << free_size / (1024.0 * 1024.0 * 1024.0) - << "GB total_size=" << total_size / (1024.0 * 1024.0 * 1024.0) << "GB." - << std::endl; - } if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement hr_timer.start("Renumber and sort sampled edgelist"); @@ -1144,14 +1136,6 @@ class Tests_SamplingPostProcessing std::optional> sorted_edgelist_label_hop_offsets{std::nullopt}; - { - size_t free_size{}; - size_t total_size{}; - RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); - std::cout << "free_size=" << free_size / (1024.0 * 1024.0 * 1024.0) - << "GB total_size=" << total_size / (1024.0 * 1024.0 * 1024.0) << "GB." 
- << std::endl; - } if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement hr_timer.start("Sort sampled edgelist"); @@ -1370,8 +1354,8 @@ INSTANTIATE_TEST_SUITE_P( SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, false, true}, SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, true, true}, SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, true, false, true}), - ::testing::Values(cugraph::test::File_Usecase("karate.mtx"), - cugraph::test::File_Usecase("dolphins.mtx")))); + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/dolphins.mtx")))); INSTANTIATE_TEST_SUITE_P( rmat_small_test, From e16c0c266eec2d831264535e4b8e4950315f37fb Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 4 Apr 2024 10:04:21 -0700 Subject: [PATCH 02/80] update sampling post processing functions to take additional input parameters --- cpp/include/cugraph/sampling_functions.hpp | 124 ++--- cpp/src/c_api/uniform_neighbor_sampling.cpp | 74 +-- .../sampling_post_processing_impl.cuh | 442 +++++++++++------- .../sampling/sampling_post_processing_sg.cu | 234 ++++++---- .../sampling/sampling_post_processing_test.cu | 66 ++- 5 files changed, 554 insertions(+), 386 deletions(-) diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp index c83e1f48972..020e5a7f9b3 100644 --- a/cpp/include/cugraph/sampling_functions.hpp +++ b/cpp/include/cugraph/sampling_functions.hpp @@ -82,11 +82,20 @@ namespace cugraph { * edgelist_srcs.size() if valid). * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p * edgelist_srcs.size() if valid). - * @param edgelist_hops An optional tuple having a vector storing edge list hop numbers (size = @p - * edgelist_srcs.size() if valid) and the number of hops. - * @param edgelist_label_offsets An optional tuple storing a pointer to the array storing label - * offsets to the input edges (size = std::get<1>(*edgelist_label_offsets) + 1) and the number of - * labels. + * @param edgelist_hops An optional vector storing edge list hop numbers (size = @p + * edgelist_srcs.size() if valid). @p edgelist_hops should be valid if @p num_hops >= 2. The hop + * vector values should be non-decreasing within each label. + * @param seed_vertices An optional pointer to the array storing seed vertices in hop 0. + * @param seed_vertex_label_offsets An optional pointer to the array storing label offsets to the + * seed vertices (size = @p num_labels + 1). @p seed_vertex_label_offsets should be valid if @p + * num_labels >= 2 and @p seed_vertices is valid and invalid otherwise. + * @param edgelist_label_offsets An optional pointer to the array storing label offsets to the input + * edges (size = @p num_labels + 1). @p edgelist_label_offsets should be valid if @p num_labels + * >= 2. + * @param num_labels Number of labels. Labels are considered if @p num_labels >=2 and ignored if @p + * num_labels = 1. + * @param num_hops Number of hops. Hop numbers are considered if @p num_hops >=2 and ignored if @p + * num_hops = 1. * @param src_is_major A flag to determine whether to use the source or destination as the * major key in renumbering and compression. 
* @param compress_per_hop A flag to determine whether to compress edges with different hop numbers @@ -100,13 +109,10 @@ namespace cugraph { * edgelist_weights.has_value() is true), optional edge IDs (valid only if @p * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p * edgelist_edge_types.has_value() is true), optional (label, hop) offset values to the - * (D)CSR|(D)CSC offset array (size = # labels * # hops + 1, where # labels = - * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1 - * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1 - * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p - * edgelist_hops.has_value() is true), renumber_map to query original vertices (size = # unique - * vertices or aggregate # unique vertices for every label), and label offsets to the renumber_map - * (size = std::get<1>(*edgelist_label_offsets) + 1, valid only if @p + * (D)CSR|(D)CSC offset array (size = @p num_labels * @p num_hops + 1, valid only when @p + * edgelist_hops.has_value() or @p edgelist_label_offsets.has_value() is true), renumber_map to + * query original vertices (size = # unique or aggregate # unique_vertices for each label), and + * label offsets to the renumber_map (size = num_labels + 1, valid only if @p * edgelist_label_offsets.has_value() is true). */ template >&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major = true, bool compress_per_hop = false, bool doubly_compress = false, @@ -180,12 +190,20 @@ renumber_and_compress_sampled_edgelist( * edgelist_srcs.size() if valid). * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p * edgelist_srcs.size() if valid). - * @param edgelist_hops An optional tuple having a vector storing edge list hop numbers (size = @p - * edgelist_srcs.size() if valid) and the number of hops. The hop vector values should be - * non-decreasing within each label. - * @param edgelist_label_offsets An optional tuple storing a pointer to the array storing label - * offsets to the input edges (size = std::get<1>(*edgelist_label_offsets) + 1) and the number of - * labels. + * @param edgelist_hops An optional vector storing edge list hop numbers (size = @p + * edgelist_srcs.size() if valid). @p edgelist_hops should be valid if @p num_hops >= 2. The hop + * vector values should be non-decreasing within each label. + * @param seed_vertices An optional pointer to the array storing seed vertices in hop 0. + * @param seed_vertex_label_offsets An optional pointer to the array storing label offsets to the + * seed vertices (size = @p num_labels + 1). @p seed_vertex_label_offsets should be valid if @p + * num_labels >= 2 and @p seed_vertices is valid and invalid otherwise. + * @param edgelist_label_offsets An optional pointer to the array storing label offsets to the input + * edges (size = @p num_labels + 1). @p edgelist_label_offsets should be valid if @p num_labels + * >= 2. + * @param num_labels Number of labels. Labels are considered if @p num_labels >=2 and ignored if @p + * num_labels = 1. + * @param num_hops Number of hops. 
Hop numbers are considered if @p num_hops >=2 and ignored if @p + * num_hops = 1. * @param src_is_major A flag to determine whether to use the source or destination as the * major key in renumbering and sorting. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). @@ -193,13 +211,10 @@ renumber_and_compress_sampled_edgelist( * only if @p edgelist_weights.has_value() is true), optional edge IDs (valid only if @p * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p * edgelist_edge_types.has_value() is true), optional (label, hop) offset values to the renumbered - * and sorted edges (size = # labels * # hops + 1, where # labels = - * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1 - * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1 - * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p - * edgelist_hops.has_value() is true), renumber_map to query original vertices (size = # unique - * vertices or aggregate # unique vertices for every label), and label offsets to the renumber_map - * (size = std::get<1>(*edgelist_label_offsets) + 1, valid only if @p + * and sorted edges (size = @p num_labels * @p num_hops + 1, valid only when @p + * edgelist_hops.has_value() or @p edgelist_label_offsetes.has_value() is true), renumber_map to + * query original vertices (size = # unique or aggregate # unique vertices for each label), and + * label offsets to the renumber map (size = @p num_labels + 1, valid only if @p * edgelist_label_offsets.has_value() is true). */ template >&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major = true, bool do_expensive_check = false); @@ -253,24 +272,24 @@ renumber_and_sort_sampled_edgelist( * edgelist_srcs.size() if valid). * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p * edgelist_srcs.size() if valid). - * @param edgelist_hops An optional tuple having a vector storing edge list hop numbers (size = @p - * edgelist_srcs.size() if valid) and the number of hops. The hop vector values should be - * non-decreasing within each label. - * @param edgelist_label_offsets An optional tuple storing a pointer to the array storing label - * offsets to the input edges (size = std::get<1>(*edgelist_label_offsets) + 1) and the number of - * labels. + * @param edgelist_hops An optional vector storing edge list hop numbers (size = @p + * edgelist_srcs.size() if valid). @p edgelist_hops must be valid if @p num_hops >= 2. The hop + * vector values should be non-decreasing within each label. + * @param edgelist_label_offsets An optional pointer to the array storing label offsets to the input + * edges (size = @p num_labels + 1). @p edgelist_label_offsets must be valid if @p num_labels >= 2. + * @param num_labels Number of labels. Labels are considered if @p num_labels >=2 and ignored if @p + * num_labels = 1. + * @param num_hops Number of hops. Hop numbers are considered if @p num_hops >=2 and ignored if @p + * num_hops = 1. 
* @param src_is_major A flag to determine whether to use the source or destination as the * major key in renumbering and sorting. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). * @return Tuple of vectors storing edge sources, edge destinations, optional edge weights (valid * only if @p edgelist_weights.has_value() is true), optional edge IDs (valid only if @p * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p - * edgelist_edge_types.has_value() is true), and optional (label, hop) offset values to the - * renumbered and sorted edges (size = # labels * # hops + 1, where # labels = - * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1 - * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1 - * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p - * edgelist_hops.has_value() is true) + * edgelist_edge_types.has_value() is true), and optional (label, hop) offset values to the sorted + * edges (size = @p num_labels * @p num_hops + 1, valid only when @p edgelist_hops.has_value() or @p + * edgelist_label_offsets.has_value() is true). */ template , // srcs std::optional>, // edge IDs std::optional>, // edge types std::optional>> // (label, hop) offsets to the edges -sort_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional>&& edgelist_weights, - std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, - bool src_is_major = true, - bool do_expensive_check = false); +sort_sampled_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, + bool src_is_major = true, + bool do_expensive_check = false); } // namespace cugraph diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index 44018e088f7..162d2c6c675 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -178,7 +178,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct } // - // Need to renumber personalization_vertices + // Need to renumber start_vertices // cugraph::renumber_local_ext_vertices( handle_, @@ -189,8 +189,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct graph_view.local_vertex_partition_range_last(), do_expensive_check_); - bool has_labels = start_vertex_labels_ != nullptr; - auto&& [src, dst, wgt, edge_id, edge_type, hop, edge_label, offsets] = cugraph::uniform_neighbor_sample( handle_, @@ -261,19 +259,21 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct label_hop_offsets, output_renumber_map, renumber_map_offsets) = - cugraph::renumber_and_sort_sampled_edgelist( + cugraph::renumber_and_sort_sampled_edgelist( handle_, std::move(src), std::move(dst), - wgt ? std::move(wgt) : std::nullopt, - edge_id ? std::move(edge_id) : std::nullopt, - edge_type ? std::move(edge_type) : std::nullopt, - hop ? 
std::make_optional(std::make_tuple(std::move(*hop), fan_out_->size_)) - : std::nullopt, - offsets ? std::make_optional(std::make_tuple( - raft::device_span{offsets->data(), offsets->size()}, - edge_label->size())) + std::move(wgt), + std::move(edge_id), + std::move(edge_type), + std::move(hop), + std::nullopt, + std::nullopt, + offsets ? std::make_optional( + raft::device_span{offsets->data(), offsets->size()}) : std::nullopt, + edge_label->size(), + fan_out_->size_, src_is_major, do_expensive_check_); @@ -296,19 +296,21 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct label_hop_offsets, output_renumber_map, renumber_map_offsets) = - cugraph::renumber_and_compress_sampled_edgelist( + cugraph::renumber_and_compress_sampled_edgelist( handle_, std::move(src), std::move(dst), - wgt ? std::move(wgt) : std::nullopt, - edge_id ? std::move(edge_id) : std::nullopt, - edge_type ? std::move(edge_type) : std::nullopt, - hop ? std::make_optional(std::make_tuple(std::move(*hop), fan_out_->size_)) - : std::nullopt, - offsets ? std::make_optional(std::make_tuple( - raft::device_span{offsets->data(), offsets->size()}, - edge_label->size())) + std::move(wgt), + std::move(edge_id), + std::move(edge_type), + std::move(hop), + std::nullopt, + std::nullopt, + offsets ? std::make_optional( + raft::device_span{offsets->data(), offsets->size()}) : std::nullopt, + edge_label->size(), + fan_out_->size_, src_is_major, options_.compress_per_hop_, doubly_compress, @@ -327,21 +329,21 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct } std::tie(src, dst, wgt, edge_id, edge_type, label_hop_offsets) = - cugraph::sort_sampled_edgelist( - handle_, - std::move(src), - std::move(dst), - wgt ? std::move(wgt) : std::nullopt, - edge_id ? std::move(edge_id) : std::nullopt, - edge_type ? std::move(edge_type) : std::nullopt, - hop ? std::make_optional(std::make_tuple(std::move(*hop), fan_out_->size_)) - : std::nullopt, - offsets ? std::make_optional(std::make_tuple( - raft::device_span{offsets->data(), offsets->size()}, - edge_label->size())) - : std::nullopt, - src_is_major, - do_expensive_check_); + cugraph::sort_sampled_edgelist(handle_, + std::move(src), + std::move(dst), + std::move(wgt), + std::move(edge_id), + std::move(edge_type), + std::move(hop), + offsets + ? 
std::make_optional(raft::device_span{ + offsets->data(), offsets->size()}) + : std::nullopt, + edge_label->size(), + fan_out_->size_, + src_is_major, + do_expensive_check_); majors.emplace(std::move(src)); minors = std::move(dst); diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index d8e081060d4..5060b283659 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -154,75 +154,174 @@ template -void check_input_edges( - raft::handle_t const& handle, - rmm::device_uvector const& edgelist_srcs, - rmm::device_uvector const& edgelist_dsts, - std::optional> const& edgelist_weights, - std::optional> const& edgelist_edge_ids, - std::optional> const& edgelist_edge_types, - std::optional, size_t>> const& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, - bool do_expensive_check) +void check_input_edges(raft::handle_t const& handle, + rmm::device_uvector const& edgelist_majors, + rmm::device_uvector const& edgelist_minors, + std::optional> const& edgelist_weights, + std::optional> const& edgelist_edge_ids, + std::optional> const& edgelist_edge_types, + std::optional> const& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, + bool do_expensive_check) { - CUGRAPH_EXPECTS(!edgelist_label_offsets || (std::get<1>(*edgelist_label_offsets) <= - std::numeric_limits::max()), - "Invalid input arguments: current implementation assumes that the number of " - "unique labels is no larger than std::numeric_limits::max()."); + CUGRAPH_EXPECTS((num_labels >= 1) && (num_labels <= std::numeric_limits::max()), + "Invalid input arguments: num_labels should be a positive integer and the " + "current implementation assumes that the number of unique labels is no larger " + "than std::numeric_limits::max()."); + CUGRAPH_EXPECTS( + ((num_labels == 1) && !edgelist_label_offsets.has_value()) || + (num_labels >= 2 && edgelist_label_offsets.has_value()), + "Invalid input arguments: edgelist_label_offsets should be std::nullopt if num_labels == 1 and " + "edgelist_label_offsets.has_value() should be true if num_labels >= 2."); + CUGRAPH_EXPECTS( + !edgelist_label_offsets.has_value() || ((*edgelist_label_offsets).size() == num_labels + 1), + "Invalid input arguments: if edgelist_label_offsets is valid, (*edgelist_label_offsets).size() " + "(size of the offset array) should be num_labels + 1."); CUGRAPH_EXPECTS( - !edgelist_label_offsets.has_value() || - (std::get<0>(*edgelist_label_offsets).size() == std::get<1>(*edgelist_label_offsets) + 1), - "Invalid input arguments: if edgelist_label_offsets is valid, " - "std::get<0>(*edgelist_label_offsets).size() (size of the offset array) should be " - "std::get<1>(*edgelist_label_offsets) (number of unique labels) + 1."); + (num_hops >= 1) && (num_hops <= std::numeric_limits::max()), + "Invalid input arguments: num_hops should be a positive integer and the current implementation " + "assumes that the number of hops is no larger than std::numeric_limits::max()."); + CUGRAPH_EXPECTS( + ((num_hops == 1) && !edgelist_hops.has_value()) || (num_hops >= 2 && edgelist_hops.has_value()), + "Invalid input arguments: edgelist_hops should be std::nullopt if num_hops == 1 and " + "edgelist_hops.has_value() should be true if num_hops >= 2."); CUGRAPH_EXPECTS( - !edgelist_hops || (std::get<1>(*edgelist_hops) <= 
std::numeric_limits::max()), - "Invalid input arguments: current implementation assumes that the number of " - "hops is no larger than std::numeric_limits::max()."); - CUGRAPH_EXPECTS(!edgelist_hops || std::get<1>(*edgelist_hops) > 0, - "Invlaid input arguments: number of hops should be larger than 0 if " - "edgelist_hops.has_value() is true."); + ((!seed_vertices.has_value() || (num_labels == 1)) && !seed_vertex_label_offsets.has_value()) || + ((seed_vertices.has_value() && (num_labels >= 2)) && seed_vertex_label_offsets.has_value()), + "Invaild input arguments: if seed_vertices.has_value() is true and num_labels >= 2, " + "seed_vertex_label_offsets.has_value() should be true. Otherwise, " + "seed_vertex_label_offsets.has_value() should be false."); + CUGRAPH_EXPECTS( + !seed_vertex_label_offsets.has_value() || + ((*seed_vertex_label_offsets).size() == num_labels + 1), + "Invalid input arguments: if seed_vertex_label_offsets is valid, " + "(*seed_vertex_label_offsets).size() (size of the offset array) should be num_labels + 1."); CUGRAPH_EXPECTS( - edgelist_srcs.size() == edgelist_dsts.size(), + edgelist_majors.size() == edgelist_minors.size(), "Invalid input arguments: edgelist_srcs.size() and edgelist_dsts.size() should coincide."); CUGRAPH_EXPECTS( - !edgelist_weights.has_value() || (edgelist_srcs.size() == (*edgelist_weights).size()), - "Invalid input arguments: if edgelist_weights is valid, std::get<0>(*edgelist_weights).size() " - "and edgelist_srcs.size() should coincide."); - CUGRAPH_EXPECTS( - !edgelist_edge_ids.has_value() || (edgelist_srcs.size() == (*edgelist_edge_ids).size()), - "Invalid input arguments: if edgelist_edge_ids is valid, " - "std::get<0>(*edgelist_edge_ids).size() and edgelist_srcs.size() should coincide."); + !edgelist_weights.has_value() || (edgelist_majors.size() == (*edgelist_weights).size()), + "Invalid input arguments: if edgelist_weights is valid, (*edgelist_weights).size() and " + "edgelist_(srcs|dsts).size() should coincide."); CUGRAPH_EXPECTS( - !edgelist_edge_types.has_value() || (edgelist_srcs.size() == (*edgelist_edge_types).size()), - "Invalid input arguments: if edgelist_edge_types is valid, " - "std::get<0>(*edgelist_edge_types).size() and edgelist_srcs.size() should coincide."); + !edgelist_edge_ids.has_value() || (edgelist_majors.size() == (*edgelist_edge_ids).size()), + "Invalid input arguments: if edgelist_edge_ids is valid, (*edgelist_edge_ids).size() and " + "edgelist_(srcs|dsts).size() should coincide."); CUGRAPH_EXPECTS( - !edgelist_hops.has_value() || (edgelist_srcs.size() == std::get<0>(*edgelist_hops).size()), - "Invalid input arguments: if edgelist_hops is valid, std::get<0>(*edgelist_hops).size() and " - "edgelist_srcs.size() should coincide."); + !edgelist_edge_types.has_value() || (edgelist_majors.size() == (*edgelist_edge_types).size()), + "Invalid input arguments: if edgelist_edge_types is valid, (*edgelist_edge_types).size() and " + "edgelist_(srcs|dsts).size() should coincide."); + CUGRAPH_EXPECTS(!edgelist_hops.has_value() || (edgelist_majors.size() == (*edgelist_hops).size()), + "Invalid input arguments: if edgelist_hops is valid, (*edgelist_hops).size() and " + "edgelist_(srcs|dsts).size() should coincide."); if (do_expensive_check) { if (edgelist_label_offsets) { CUGRAPH_EXPECTS(thrust::is_sorted(handle.get_thrust_policy(), - std::get<0>(*edgelist_label_offsets).begin(), - std::get<0>(*edgelist_label_offsets).end()), + (*edgelist_label_offsets).begin(), + (*edgelist_label_offsets).end()), "Invalid input arguments: if 
edgelist_label_offsets is valid, " - "std::get<0>(*edgelist_label_offsets) should be sorted."); + "*edgelist_label_offsets should be sorted."); size_t back_element{}; - raft::update_host( - &back_element, - std::get<0>(*edgelist_label_offsets).data() + std::get<1>(*edgelist_label_offsets), - size_t{1}, - handle.get_stream()); + raft::update_host(&back_element, + (*edgelist_label_offsets).data() + num_labels, + size_t{1}, + handle.get_stream()); handle.sync_stream(); CUGRAPH_EXPECTS( - back_element == edgelist_srcs.size(), + back_element == edgelist_majors.size(), "Invalid input arguments: if edgelist_label_offsets is valid, the last element of " - "std::get<0>(*edgelist_label_offsets) and edgelist_srcs.size() should coincide."); + "*edgelist_label_offsets and edgelist_(srcs|dsts).size() should coincide."); + } + + if (seed_vertices) { + for (size_t i = 0; i < num_labels; ++i) { + rmm::device_uvector this_label_seed_vertices(0, handle.get_stream()); + { + size_t start_offset{0}; + auto end_offset = (*seed_vertices).size(); + if (seed_vertex_label_offsets) { + raft::update_host( + &start_offset, (*seed_vertex_label_offsets).data() + i, 1, handle.get_stream()); + raft::update_host( + &end_offset, (*seed_vertex_label_offsets).data() + (i + 1), 1, handle.get_stream()); + handle.sync_stream(); + } + this_label_seed_vertices.resize(end_offset - start_offset, handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*seed_vertices).begin() + start_offset, + (*seed_vertices).begin() + end_offset, + this_label_seed_vertices.begin()); + thrust::sort(handle.get_thrust_policy(), + this_label_seed_vertices.begin(), + this_label_seed_vertices.end()); + this_label_seed_vertices.resize( + thrust::distance(this_label_seed_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + this_label_seed_vertices.begin(), + this_label_seed_vertices.end())), + handle.get_stream()); + } + + rmm::device_uvector this_label_zero_hop_majors(0, handle.get_stream()); + { + size_t start_offset{0}; + auto end_offset = edgelist_majors.size(); + if (edgelist_label_offsets) { + raft::update_host( + &start_offset, (*edgelist_label_offsets).data() + i, 1, handle.get_stream()); + raft::update_host( + &end_offset, (*edgelist_label_offsets).data() + (i + 1), 1, handle.get_stream()); + handle.sync_stream(); + } + + if (edgelist_hops) { + this_label_zero_hop_majors.resize( + thrust::distance(this_label_zero_hop_majors.begin(), + thrust::copy_if(handle.get_thrust_policy(), + edgelist_majors.begin() + start_offset, + edgelist_majors.begin() + end_offset, + (*edgelist_hops).begin() + start_offset, + this_label_zero_hop_majors.begin(), + detail::is_equal_t{0})), + handle.get_stream()); + } else { + thrust::copy(handle.get_thrust_policy(), + edgelist_majors.begin() + start_offset, + edgelist_majors.begin() + end_offset, + this_label_zero_hop_majors.begin()); + } + thrust::sort(handle.get_thrust_policy(), + this_label_zero_hop_majors.begin(), + this_label_zero_hop_majors.end()); + this_label_zero_hop_majors.resize( + thrust::distance(this_label_zero_hop_majors.begin(), + thrust::unique(handle.get_thrust_policy(), + this_label_zero_hop_majors.begin(), + this_label_zero_hop_majors.end())), + handle.get_stream()); + } + + rmm::device_uvector zero_hop_majors_minus_seed_vertices( + this_label_zero_hop_majors.size(), handle.get_stream()); + CUGRAPH_EXPECTS(thrust::distance( + zero_hop_majors_minus_seed_vertices.begin(), + thrust::set_difference(handle.get_thrust_policy(), + this_label_zero_hop_majors.begin(), + 
this_label_zero_hop_majors.end(), + this_label_seed_vertices.begin(), + this_label_seed_vertices.end(), + zero_hop_majors_minus_seed_vertices.begin())) == 0, + "Invalid input arguments: if seed_vertices.has_value() is true, " + "seed_vertices should include all zero-hop majors."); + } } } } @@ -394,8 +493,12 @@ compute_renumber_map(raft::handle_t const& handle, raft::device_span edgelist_majors, raft::device_span edgelist_minors, std::optional> edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, std::optional> edgelist_label_offsets) { + if (seed_vertices) { CUGRAPH_FAIL("unimplemented."); } + auto approx_edges_to_sort_per_iteration = static_cast(handle.get_device_properties().multiProcessorCount) * (1 << 20) /* tuning parameter */; // for segmented sort @@ -648,13 +751,15 @@ std::tuple, rmm::device_uvector, rmm::device_uvector, std::optional>> -renumber_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_majors, - rmm::device_uvector&& edgelist_minors, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, - bool do_expensive_check) +renumber_sampled_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_majors, + rmm::device_uvector&& edgelist_minors, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + bool do_expensive_check) { // 1. compute renumber_map @@ -662,12 +767,10 @@ renumber_sampled_edgelist( handle, raft::device_span(edgelist_majors.data(), edgelist_majors.size()), raft::device_span(edgelist_minors.data(), edgelist_minors.size()), - edgelist_hops ? std::make_optional>( - std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) - : std::nullopt, - edgelist_label_offsets - ? std::make_optional>(std::get<0>(*edgelist_label_offsets)) - : std::nullopt); + edgelist_hops, + seed_vertices, + seed_vertex_label_offsets, + edgelist_label_offsets); // 2. 
compute renumber map offsets for each label @@ -687,8 +790,7 @@ renumber_sampled_edgelist( unique_label_indices.begin(), vertex_counts.begin()); - renumber_map_label_offsets = - rmm::device_uvector(std::get<1>(*edgelist_label_offsets) + 1, handle.get_stream()); + renumber_map_label_offsets = rmm::device_uvector(num_labels + 1, handle.get_stream()); thrust::fill(handle.get_thrust_policy(), (*renumber_map_label_offsets).begin(), (*renumber_map_label_offsets).end(), @@ -725,8 +827,6 @@ renumber_sampled_edgelist( (*renumber_map_label_indices).resize(0, handle.get_stream()); (*renumber_map_label_indices).shrink_to_fit(handle.get_stream()); - auto num_labels = std::get<0>(*edgelist_label_offsets).size(); - rmm::device_uvector segment_sorted_renumber_map(renumber_map.size(), handle.get_stream()); rmm::device_uvector segment_sorted_new_vertices(new_vertices.size(), @@ -785,8 +885,8 @@ renumber_sampled_edgelist( new_vertices.shrink_to_fit(handle.get_stream()); d_tmp_storage.shrink_to_fit(handle.get_stream()); - auto edgelist_label_indices = detail::expand_sparse_offsets( - std::get<0>(*edgelist_label_offsets), label_index_t{0}, handle.get_stream()); + auto edgelist_label_indices = + detail::expand_sparse_offsets(*edgelist_label_offsets, label_index_t{0}, handle.get_stream()); auto pair_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_label_indices.begin()); @@ -887,16 +987,15 @@ std::tuple, std::optional>, std::optional>, std::optional>, - std::optional, size_t>>> -sort_sampled_edge_tuples( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_majors, - rmm::device_uvector&& edgelist_minors, - std::optional>&& edgelist_weights, - std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets) + std::optional>> +sort_sampled_edge_tuples(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_majors, + rmm::device_uvector&& edgelist_minors, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> edgelist_label_offsets) { std::vector h_label_offsets{}; std::vector h_edge_offsets{}; @@ -906,11 +1005,8 @@ sort_sampled_edge_tuples( static_cast(handle.get_device_properties().multiProcessorCount) * (1 << 20) /* tuning parameter */; // for sorts in chunks - std::tie(h_label_offsets, h_edge_offsets) = - detail::compute_offset_aligned_element_chunks(handle, - std::get<0>(*edgelist_label_offsets), - edgelist_majors.size(), - approx_edges_to_sort_per_iteration); + std::tie(h_label_offsets, h_edge_offsets) = detail::compute_offset_aligned_element_chunks( + handle, *edgelist_label_offsets, edgelist_majors.size(), approx_edges_to_sort_per_iteration); } else { h_label_offsets = {0, 1}; h_edge_offsets = {0, edgelist_majors.size()}; @@ -923,11 +1019,11 @@ sort_sampled_edge_tuples( thrust::sequence(handle.get_thrust_policy(), indices.begin(), indices.end(), size_t{0}); edge_order_t edge_order_comp{ edgelist_label_offsets ? thrust::make_optional>( - std::get<0>(*edgelist_label_offsets).data() + h_label_offsets[i], + (*edgelist_label_offsets).data() + h_label_offsets[i], (h_label_offsets[i + 1] - h_label_offsets[i]) + 1) : thrust::nullopt, edgelist_hops ? 
thrust::make_optional>( - std::get<0>(*edgelist_hops).data() + h_edge_offsets[i], indices.size()) + (*edgelist_hops).data() + h_edge_offsets[i], indices.size()) : thrust::nullopt, raft::device_span(edgelist_majors.data() + h_edge_offsets[i], indices.size()), raft::device_span(edgelist_minors.data() + h_edge_offsets[i], @@ -956,10 +1052,8 @@ sort_sampled_edge_tuples( } if (edgelist_hops) { - permute_array(handle, - indices.begin(), - indices.end(), - std::get<0>(*edgelist_hops).begin() + h_edge_offsets[i]); + permute_array( + handle, indices.begin(), indices.end(), (*edgelist_hops).begin() + h_edge_offsets[i]); } } @@ -994,8 +1088,12 @@ renumber_and_compress_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool compress_per_hop, bool doubly_compress, @@ -1003,19 +1101,23 @@ renumber_and_compress_sampled_edgelist( { using label_index_t = uint32_t; - auto num_labels = edgelist_label_offsets ? std::get<1>(*edgelist_label_offsets) : size_t{1}; - auto num_hops = edgelist_hops ? std::get<1>(*edgelist_hops) : size_t{1}; + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); // 1. check input arguments check_input_edges(handle, - edgelist_srcs, - edgelist_dsts, + edgelist_majors, + edgelist_minors, edgelist_weights, edgelist_edge_ids, edgelist_edge_types, edgelist_hops, + seed_vertices, + seed_vertex_label_offsets, edgelist_label_offsets, + num_labels, + num_hops, do_expensive_check); CUGRAPH_EXPECTS( @@ -1027,9 +1129,6 @@ renumber_and_compress_sampled_edgelist( // 2. renumber - auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); - auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); - rmm::device_uvector renumber_map(0, handle.get_stream()); std::optional> renumber_map_label_offsets{std::nullopt}; std::tie(edgelist_majors, edgelist_minors, renumber_map, renumber_map_label_offsets) = @@ -1037,12 +1136,13 @@ renumber_and_compress_sampled_edgelist( handle, std::move(edgelist_majors), std::move(edgelist_minors), - edgelist_hops ? std::make_optional(std::make_tuple( - raft::device_span(std::get<0>(*edgelist_hops).data(), - std::get<0>(*edgelist_hops).size()), - num_hops)) + edgelist_hops ? std::make_optional(raft::device_span((*edgelist_hops).data(), + (*edgelist_hops).size())) : std::nullopt, + seed_vertices, + seed_vertex_label_offsets, edgelist_label_offsets, + num_labels, do_expensive_check); // 3. sort by ((l), (h), major, minor) @@ -1069,10 +1169,9 @@ renumber_and_compress_sampled_edgelist( auto label_index_first = thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), optionally_compute_label_index_t{ - edgelist_label_offsets ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + edgelist_label_offsets ? 
thrust::make_optional(*edgelist_label_offsets) : thrust::nullopt}); - auto input_key_first = - thrust::make_zip_iterator(label_index_first, std::get<0>(*edgelist_hops).begin()); + auto input_key_first = thrust::make_zip_iterator(label_index_first, (*edgelist_hops).begin()); rmm::device_uvector unique_key_label_indices(min_vertices.size(), handle.get_stream()); rmm::device_uvector unique_key_hops(min_vertices.size(), handle.get_stream()); @@ -1133,10 +1232,9 @@ renumber_and_compress_sampled_edgelist( thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(edgelist_majors.size()), is_first_in_run_t{ - edgelist_label_offsets ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) - : thrust::nullopt, + detail::to_thrust_optional(edgelist_label_offsets), edgelist_hops ? thrust::make_optional>( - std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + (*edgelist_hops).data(), (*edgelist_hops).size()) : thrust::nullopt, raft::device_span( edgelist_majors.data(), @@ -1156,11 +1254,11 @@ renumber_and_compress_sampled_edgelist( if (edgelist_label_offsets) { auto label_index_first = thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), - compute_label_index_t{std::get<0>(*edgelist_label_offsets)}); + compute_label_index_t{*edgelist_label_offsets}); if (edgelist_hops) { auto input_key_first = thrust::make_zip_iterator( - label_index_first, std::get<0>(*edgelist_hops).begin(), edgelist_majors.begin()); + label_index_first, (*edgelist_hops).begin(), edgelist_majors.begin()); auto output_key_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), (*compressed_hops).begin(), compressed_nzd_vertices.begin()); @@ -1184,7 +1282,7 @@ renumber_and_compress_sampled_edgelist( } else { if (edgelist_hops) { auto input_key_first = - thrust::make_zip_iterator(std::get<0>(*edgelist_hops).begin(), edgelist_majors.begin()); + thrust::make_zip_iterator((*edgelist_hops).begin(), edgelist_majors.begin()); auto output_key_first = thrust::make_zip_iterator((*compressed_hops).begin(), compressed_nzd_vertices.begin()); thrust::reduce_by_key(handle.get_thrust_policy(), @@ -1218,10 +1316,7 @@ renumber_and_compress_sampled_edgelist( if (edgelist_label_offsets || edgelist_hops) { rmm::device_uvector offset_array_offsets(num_labels * num_hops + 1, handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), - offset_array_offsets.begin(), - offset_array_offsets.end(), - size_t{0}); + offset_array_offsets.set_element_to_zero_async(0, handle.get_stream()); if (edgelist_label_offsets) { if (edgelist_hops) { @@ -1266,13 +1361,10 @@ renumber_and_compress_sampled_edgelist( handle.get_thrust_policy(), major_vertex_counts.begin(), major_vertex_counts.end(), - [edgelist_label_offsets = edgelist_label_offsets - ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) - : thrust::nullopt, - edgelist_hops = edgelist_hops - ? thrust::make_optional>( - std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) - : thrust::nullopt, + [edgelist_label_offsets = detail::to_thrust_optional(edgelist_label_offsets), + edgelist_hops = edgelist_hops ? 
thrust::make_optional>( + (*edgelist_hops).data(), (*edgelist_hops).size()) + : thrust::nullopt, edgelist_majors = raft::device_span(edgelist_majors.data(), edgelist_majors.size()), num_hops, @@ -1550,33 +1642,38 @@ renumber_and_sort_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool do_expensive_check) { using label_index_t = uint32_t; - auto num_labels = edgelist_label_offsets ? std::get<1>(*edgelist_label_offsets) : size_t{1}; - auto num_hops = edgelist_hops ? std::get<1>(*edgelist_hops) : size_t{1}; + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); // 1. check input arguments check_input_edges(handle, - edgelist_srcs, - edgelist_dsts, + edgelist_majors, + edgelist_minors, edgelist_weights, edgelist_edge_ids, edgelist_edge_types, edgelist_hops, + seed_vertices, + seed_vertex_label_offsets, edgelist_label_offsets, + num_labels, + num_hops, do_expensive_check); // 2. renumber - auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); - auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); - rmm::device_uvector renumber_map(0, handle.get_stream()); std::optional> renumber_map_label_offsets{std::nullopt}; std::tie(edgelist_majors, edgelist_minors, renumber_map, renumber_map_label_offsets) = @@ -1584,12 +1681,13 @@ renumber_and_sort_sampled_edgelist( handle, std::move(edgelist_majors), std::move(edgelist_minors), - edgelist_hops ? std::make_optional(std::make_tuple( - raft::device_span(std::get<0>(*edgelist_hops).data(), - std::get<0>(*edgelist_hops).size()), - num_hops)) + edgelist_hops ? std::make_optional(raft::device_span((*edgelist_hops).data(), + (*edgelist_hops).size())) : std::nullopt, + seed_vertices, + seed_vertex_label_offsets, edgelist_label_offsets, + num_labels, do_expensive_check); // 3. sort by ((l), (h), major, minor) @@ -1624,13 +1722,10 @@ renumber_and_sort_sampled_edgelist( thrust::make_counting_iterator(num_labels * num_hops), (*edgelist_label_hop_offsets).begin(), cuda::proclaim_return_type( - [edgelist_label_offsets = edgelist_label_offsets - ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) - : thrust::nullopt, - edgelist_hops = - edgelist_hops ? thrust::make_optional>( - std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) - : thrust::nullopt, + [edgelist_label_offsets = detail::to_thrust_optional(edgelist_label_offsets), + edgelist_hops = edgelist_hops ? 
thrust::make_optional>( + (*edgelist_hops).data(), (*edgelist_hops).size()) + : thrust::nullopt, num_hops, num_edges = edgelist_majors.size()] __device__(size_t i) { size_t start_offset{0}; @@ -1687,40 +1782,42 @@ std::tuple, // srcs std::optional>, // edge IDs std::optional>, // edge types std::optional>> // (label, hop) offsets to the edges -sort_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional>&& edgelist_weights, - std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, - bool src_is_major, - bool do_expensive_check) +sort_sampled_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, + bool src_is_major, + bool do_expensive_check) { using label_index_t = uint32_t; - auto num_labels = edgelist_label_offsets ? std::get<1>(*edgelist_label_offsets) : size_t{1}; - auto num_hops = edgelist_hops ? std::get<1>(*edgelist_hops) : size_t{1}; + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); // 1. check input arguments - check_input_edges(handle, - edgelist_srcs, - edgelist_dsts, - edgelist_weights, - edgelist_edge_ids, - edgelist_edge_types, - edgelist_hops, - edgelist_label_offsets, - do_expensive_check); + check_input_edges(handle, + edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + std::nullopt, + std::nullopt, + edgelist_label_offsets, + num_labels, + num_hops, + do_expensive_check); // 2. sort by ((l), (h), major, minor) - auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); - auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); - std::tie(edgelist_majors, edgelist_minors, edgelist_weights, @@ -1751,13 +1848,10 @@ sort_sampled_edgelist( thrust::make_counting_iterator(num_labels * num_hops), (*edgelist_label_hop_offsets).begin(), cuda::proclaim_return_type( - [edgelist_label_offsets = edgelist_label_offsets - ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) - : thrust::nullopt, - edgelist_hops = - edgelist_hops ? thrust::make_optional>( - std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) - : thrust::nullopt, + [edgelist_label_offsets = detail::to_thrust_optional(edgelist_label_offsets), + edgelist_hops = edgelist_hops ? 
thrust::make_optional>( + (*edgelist_hops).data(), (*edgelist_hops).size()) + : thrust::nullopt, num_hops, num_edges = edgelist_majors.size()] __device__(size_t i) { size_t start_offset{0}; diff --git a/cpp/src/sampling/sampling_post_processing_sg.cu b/cpp/src/sampling/sampling_post_processing_sg.cu index 5a243c9cb6b..3c6734559ed 100644 --- a/cpp/src/sampling/sampling_post_processing_sg.cu +++ b/cpp/src/sampling/sampling_post_processing_sg.cu @@ -36,8 +36,12 @@ renumber_and_compress_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool compress_per_hop, bool doubly_compress, @@ -59,8 +63,12 @@ renumber_and_compress_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool compress_per_hop, bool doubly_compress, @@ -82,8 +90,12 @@ renumber_and_compress_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool compress_per_hop, bool doubly_compress, @@ -105,8 +117,12 @@ renumber_and_compress_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool compress_per_hop, bool doubly_compress, @@ -128,8 +144,12 @@ renumber_and_compress_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool compress_per_hop, bool doubly_compress, @@ -151,8 +171,12 @@ renumber_and_compress_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool compress_per_hop, bool doubly_compress, @@ -173,8 +197,12 @@ renumber_and_sort_sampled_edgelist( 
std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool do_expensive_check); @@ -193,8 +221,12 @@ renumber_and_sort_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool do_expensive_check); @@ -213,8 +245,12 @@ renumber_and_sort_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool do_expensive_check); @@ -233,8 +269,12 @@ renumber_and_sort_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool do_expensive_check); @@ -253,8 +293,12 @@ renumber_and_sort_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool do_expensive_check); @@ -273,8 +317,12 @@ renumber_and_sort_sampled_edgelist( std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, + std::optional>&& edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, bool src_is_major, bool do_expensive_check); @@ -284,17 +332,18 @@ template std::tuple, std::optional>, std::optional>, std::optional>> -sort_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional>&& edgelist_weights, - std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, - bool src_is_major, - bool do_expensive_check); +sort_sampled_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& 
edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, + bool src_is_major, + bool do_expensive_check); template std::tuple, rmm::device_uvector, @@ -302,17 +351,18 @@ template std::tuple, std::optional>, std::optional>, std::optional>> -sort_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional>&& edgelist_weights, - std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, - bool src_is_major, - bool do_expensive_check); +sort_sampled_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, + bool src_is_major, + bool do_expensive_check); template std::tuple, rmm::device_uvector, @@ -320,17 +370,18 @@ template std::tuple, std::optional>, std::optional>, std::optional>> -sort_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional>&& edgelist_weights, - std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, - bool src_is_major, - bool do_expensive_check); +sort_sampled_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, + bool src_is_major, + bool do_expensive_check); template std::tuple, rmm::device_uvector, @@ -338,17 +389,18 @@ template std::tuple, std::optional>, std::optional>, std::optional>> -sort_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional>&& edgelist_weights, - std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, - bool src_is_major, - bool do_expensive_check); +sort_sampled_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, + bool src_is_major, + bool do_expensive_check); template std::tuple, rmm::device_uvector, @@ -356,17 +408,18 @@ template std::tuple, std::optional>, std::optional>, std::optional>> -sort_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional>&& edgelist_weights, - std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, - bool 
src_is_major, - bool do_expensive_check); +sort_sampled_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, + bool src_is_major, + bool do_expensive_check); template std::tuple, rmm::device_uvector, @@ -374,16 +427,17 @@ template std::tuple, std::optional>, std::optional>, std::optional>> -sort_sampled_edgelist( - raft::handle_t const& handle, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional>&& edgelist_weights, - std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types, - std::optional, size_t>>&& edgelist_hops, - std::optional, size_t>> edgelist_label_offsets, - bool src_is_major, - bool do_expensive_check); +sort_sampled_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional>&& edgelist_hops, + std::optional> edgelist_label_offsets, + size_t num_labels, + size_t num_hops, + bool src_is_major, + bool do_expensive_check); } // namespace cugraph diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu index ce45ea08162..3e5b0a09179 100644 --- a/cpp/tests/sampling/sampling_post_processing_test.cu +++ b/cpp/tests/sampling/sampling_post_processing_test.cu @@ -548,11 +548,9 @@ class Tests_SamplingPostProcessing std::optional> renumbered_and_sorted_edgelist_edge_types{ std::nullopt}; auto renumbered_and_sorted_edgelist_hops = - org_edgelist_hops - ? std::make_optional(std::make_tuple( - rmm::device_uvector((*org_edgelist_hops).size(), handle.get_stream()), - sampling_post_processing_usecase.fanouts.size())) - : std::nullopt; + org_edgelist_hops ? std::make_optional(rmm::device_uvector( + (*org_edgelist_hops).size(), handle.get_stream())) + : std::nullopt; raft::copy(renumbered_and_sorted_edgelist_srcs.data(), org_edgelist_srcs.data(), @@ -569,7 +567,7 @@ class Tests_SamplingPostProcessing handle.get_stream()); } if (renumbered_and_sorted_edgelist_hops) { - raft::copy(std::get<0>(*renumbered_and_sorted_edgelist_hops).data(), + raft::copy((*renumbered_and_sorted_edgelist_hops).data(), (*org_edgelist_hops).data(), (*org_edgelist_hops).size(), handle.get_stream()); @@ -594,7 +592,7 @@ class Tests_SamplingPostProcessing renumbered_and_sorted_edgelist_label_hop_offsets, renumbered_and_sorted_renumber_map, renumbered_and_sorted_renumber_map_label_offsets) = - cugraph::renumber_and_sort_sampled_edgelist( + cugraph::renumber_and_sort_sampled_edgelist( handle, std::move(renumbered_and_sorted_edgelist_srcs), std::move(renumbered_and_sorted_edgelist_dsts), @@ -602,12 +600,14 @@ class Tests_SamplingPostProcessing std::move(renumbered_and_sorted_edgelist_edge_ids), std::move(renumbered_and_sorted_edgelist_edge_types), std::move(renumbered_and_sorted_edgelist_hops), + std::nullopt, + std::nullopt, org_edgelist_label_offsets - ? std::make_optional(std::make_tuple( - raft::device_span((*org_edgelist_label_offsets).data(), - (*org_edgelist_label_offsets).size()), - sampling_post_processing_usecase.num_labels)) + ? 
std::make_optional(raft::device_span( + (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size())) : std::nullopt, + sampling_post_processing_usecase.num_labels, + sampling_post_processing_usecase.fanouts.size(), sampling_post_processing_usecase.src_is_major); if (cugraph::test::g_perf) { @@ -786,11 +786,9 @@ class Tests_SamplingPostProcessing std::optional> renumbered_and_compressed_edgelist_edge_types{ std::nullopt}; auto renumbered_and_compressed_edgelist_hops = - org_edgelist_hops - ? std::make_optional(std::make_tuple( - rmm::device_uvector((*org_edgelist_hops).size(), handle.get_stream()), - sampling_post_processing_usecase.fanouts.size())) - : std::nullopt; + org_edgelist_hops ? std::make_optional(rmm::device_uvector( + (*org_edgelist_hops).size(), handle.get_stream())) + : std::nullopt; raft::copy(renumbered_and_compressed_edgelist_srcs.data(), org_edgelist_srcs.data(), @@ -807,7 +805,7 @@ class Tests_SamplingPostProcessing handle.get_stream()); } if (renumbered_and_compressed_edgelist_hops) { - raft::copy(std::get<0>(*renumbered_and_compressed_edgelist_hops).data(), + raft::copy((*renumbered_and_compressed_edgelist_hops).data(), (*org_edgelist_hops).data(), (*org_edgelist_hops).size(), handle.get_stream()); @@ -838,7 +836,7 @@ class Tests_SamplingPostProcessing renumbered_and_compressed_offset_label_hop_offsets, renumbered_and_compressed_renumber_map, renumbered_and_compressed_renumber_map_label_offsets) = - cugraph::renumber_and_compress_sampled_edgelist( + cugraph::renumber_and_compress_sampled_edgelist( handle, std::move(renumbered_and_compressed_edgelist_srcs), std::move(renumbered_and_compressed_edgelist_dsts), @@ -846,12 +844,14 @@ class Tests_SamplingPostProcessing std::move(renumbered_and_compressed_edgelist_edge_ids), std::move(renumbered_and_compressed_edgelist_edge_types), std::move(renumbered_and_compressed_edgelist_hops), + std::nullopt, + std::nullopt, org_edgelist_label_offsets - ? std::make_optional(std::make_tuple( - raft::device_span((*org_edgelist_label_offsets).data(), - (*org_edgelist_label_offsets).size()), - sampling_post_processing_usecase.num_labels)) + ? std::make_optional(raft::device_span( + (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size())) : std::nullopt, + sampling_post_processing_usecase.num_labels, + sampling_post_processing_usecase.fanouts.size(), sampling_post_processing_usecase.src_is_major, sampling_post_processing_usecase.compress_per_hop, sampling_post_processing_usecase.doubly_compress); @@ -1106,12 +1106,10 @@ class Tests_SamplingPostProcessing : std::nullopt; std::optional> sorted_edgelist_edge_ids{std::nullopt}; std::optional> sorted_edgelist_edge_types{std::nullopt}; - auto sorted_edgelist_hops = - org_edgelist_hops - ? std::make_optional(std::make_tuple( - rmm::device_uvector((*org_edgelist_hops).size(), handle.get_stream()), - sampling_post_processing_usecase.fanouts.size())) - : std::nullopt; + auto sorted_edgelist_hops = org_edgelist_hops + ? 
std::make_optional(rmm::device_uvector( + (*org_edgelist_hops).size(), handle.get_stream())) + : std::nullopt; raft::copy(sorted_edgelist_srcs.data(), org_edgelist_srcs.data(), @@ -1128,7 +1126,7 @@ class Tests_SamplingPostProcessing handle.get_stream()); } if (sorted_edgelist_hops) { - raft::copy(std::get<0>(*sorted_edgelist_hops).data(), + raft::copy((*sorted_edgelist_hops).data(), (*org_edgelist_hops).data(), (*org_edgelist_hops).size(), handle.get_stream()); @@ -1147,7 +1145,7 @@ class Tests_SamplingPostProcessing sorted_edgelist_edge_ids, sorted_edgelist_edge_types, sorted_edgelist_label_hop_offsets) = - cugraph::sort_sampled_edgelist( + cugraph::sort_sampled_edgelist( handle, std::move(sorted_edgelist_srcs), std::move(sorted_edgelist_dsts), @@ -1156,11 +1154,11 @@ class Tests_SamplingPostProcessing std::move(sorted_edgelist_edge_types), std::move(sorted_edgelist_hops), org_edgelist_label_offsets - ? std::make_optional(std::make_tuple( - raft::device_span((*org_edgelist_label_offsets).data(), - (*org_edgelist_label_offsets).size()), - sampling_post_processing_usecase.num_labels)) + ? std::make_optional(raft::device_span( + (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size())) : std::nullopt, + sampling_post_processing_usecase.num_labels, + sampling_post_processing_usecase.fanouts.size(), sampling_post_processing_usecase.src_is_major); if (cugraph::test::g_perf) { From f4bcf7248692e94409610d40c790395508ed32bd Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Sat, 6 Apr 2024 18:28:30 -0400 Subject: [PATCH 03/80] fix tests for pyg 2.5 --- .../cugraph_pyg/nn/conv/hetero_gat_conv.py | 11 +++--- .../cugraph_pyg/tests/nn/test_gat_conv.py | 13 +++---- .../tests/nn/test_hetero_gat_conv.py | 36 ++++++++++--------- .../cugraph-pyg/cugraph_pyg/utils/__init__.py | 12 +++++++ .../cugraph-pyg/cugraph_pyg/utils/imports.py | 32 +++++++++++++++++ 5 files changed, 76 insertions(+), 28 deletions(-) create mode 100644 python/cugraph-pyg/cugraph_pyg/utils/__init__.py create mode 100644 python/cugraph-pyg/cugraph_pyg/utils/imports.py diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py index 3b717552a96..6b648c1b77a 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,6 +18,7 @@ from pylibcugraphops.pytorch.operators import mha_gat_n2n from .base import BaseConv +from cugraph_pyg.utils.imports import package_available torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") @@ -74,10 +75,10 @@ def __init__( bias: bool = True, aggr: str = "sum", ): - major, minor, patch = torch_geometric.__version__.split(".")[:3] - pyg_version = tuple(map(int, [major, minor, patch])) - if pyg_version < (2, 4, 0): - raise RuntimeError(f"{self.__class__.__name__} requires pyg >= 2.4.0.") + if not package_available("torch_geometric>=2.4.0"): + raise RuntimeError( + f"{self.__class__.__name__} requires torch_geometric>=2.4.0." 
+ ) super().__init__() diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py index 62bebb9211d..3a34372cdce 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -14,10 +14,14 @@ import pytest from cugraph_pyg.nn import GATConv as CuGraphGATConv +from cugraph_pyg.utils.imports import package_available ATOL = 1e-6 +@pytest.mark.skipif( + package_available("torch_geometric<2.5"), reason="Test requires pyg>=2.5" +) @pytest.mark.parametrize("bias", [True, False]) @pytest.mark.parametrize("bipartite", [True, False]) @pytest.mark.parametrize("concat", [True, False]) @@ -28,7 +32,6 @@ def test_gat_conv_equality( bias, bipartite, concat, heads, max_num_neighbors, use_edge_attr, graph, request ): - pytest.importorskip("torch_geometric", reason="PyG not available") import torch from torch_geometric.nn import GATConv @@ -71,7 +74,7 @@ def test_gat_conv_equality( conv2.lin_src.weight.data = conv1.lin_src.weight.data.detach().clone() conv2.lin_dst.weight.data = conv1.lin_dst.weight.data.detach().clone() else: - conv2.lin.weight.data = conv1.lin_src.weight.data.detach().clone() + conv2.lin.weight.data = conv1.lin.weight.data.detach().clone() conv2.att.data[:out_dim] = conv1.att_src.data.flatten() conv2.att.data[out_dim : 2 * out_dim] = conv1.att_dst.data.flatten() @@ -95,9 +98,7 @@ def test_gat_conv_equality( conv1.lin_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL ) else: - assert torch.allclose( - conv1.lin_src.weight.grad, conv2.lin.weight.grad, atol=ATOL - ) + assert torch.allclose(conv1.lin.weight.grad, conv2.lin.weight.grad, atol=ATOL) assert torch.allclose( conv1.att_src.grad.flatten(), conv2.att.grad[:out_dim], atol=ATOL diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py index 1c841a17df7..e1029849cb0 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -14,31 +14,23 @@ import pytest from cugraph_pyg.nn import HeteroGATConv as CuGraphHeteroGATConv -from cugraph.utilities.utils import import_optional, MissingModule - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") +from cugraph_pyg.utils.imports import package_available ATOL = 1e-6 @pytest.mark.cugraph_ops -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") @pytest.mark.skipif( - isinstance(torch_geometric, MissingModule), reason="torch_geometric not available" + package_available("torch_geometric<2.4"), reason="Test requires pyg>=2.4" ) @pytest.mark.parametrize("heads", [1, 3, 10]) @pytest.mark.parametrize("aggr", ["sum", "mean"]) def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads): - major, minor, patch = torch_geometric.__version__.split(".")[:3] - pyg_version = tuple(map(int, [major, minor, patch])) - if pyg_version < (2, 4, 0): - pytest.skip("Skipping HeteroGATConv test") - + import torch from torch_geometric.data import HeteroData from torch_geometric.nn import HeteroConv, GATConv - device = torch.device("cuda:0") + device = torch.device("cuda") data = HeteroData(sample_pyg_hetero_data).to(device) in_channels_dict = {k: v.size(1) for k, v in data.x_dict.items()} @@ -73,9 +65,15 @@ def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads): with torch.no_grad(): for edge_type in conv2.edge_types: src_t, _, dst_t = edge_type - w_src[edge_type][:, :] = conv1.convs[edge_type].lin_src.weight[:, :] - if w_dst[edge_type] is not None: - w_dst[edge_type][:, :] = conv1.convs[edge_type].lin_dst.weight[:, :] + if src_t == dst_t: + w_src[edge_type][:, :] = conv1.convs[edge_type].lin.weight[:, :] + else: + w_src[edge_type][:, :] = conv1.convs[edge_type].lin_src.weight[:, :] + if w_dst[edge_type] is not None: + w_dst[edge_type][:, :] = conv1.convs[edge_type].lin_dst.weight[:, :] + # w_src[edge_type][:, :] = conv1.convs[edge_type].lin_src.weight[:, :] + # if w_dst[edge_type] is not None: + # w_dst[edge_type][:, :] = conv1.convs[edge_type].lin_dst.weight[:, :] conv2.attn_weights[edge_type][: heads * out_channels] = conv1.convs[ edge_type @@ -118,7 +116,11 @@ def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads): for node_t, (rels_as_src, rels_as_dst) in conv2.relations_per_ntype.items(): grad_list = [] for rel_t in rels_as_src: - grad_list.append(conv1.convs[rel_t].lin_src.weight.grad.clone()) + src_type, _, dst_type = rel_t + if src_type == dst_type: + grad_list.append(conv1.convs[rel_t].lin.weight.grad.clone()) + else: + grad_list.append(conv1.convs[rel_t].lin_src.weight.grad.clone()) for rel_t in rels_as_dst: grad_list.append(conv1.convs[rel_t].lin_dst.weight.grad.clone()) assert len(grad_list) > 0 diff --git a/python/cugraph-pyg/cugraph_pyg/utils/__init__.py b/python/cugraph-pyg/cugraph_pyg/utils/__init__.py new file mode 100644 index 00000000000..aeae6078111 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/utils/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph-pyg/cugraph_pyg/utils/imports.py b/python/cugraph-pyg/cugraph_pyg/utils/imports.py new file mode 100644 index 00000000000..1cc865a1f35 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/utils/imports.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from packaging.requirements import Requirement +from importlib import import_module + + +def package_available(requirement: str) -> bool: + """Check if a package is installed and meets the version requirement.""" + req = Requirement(requirement) + try: + pkg = import_module(req.name) + except ImportError: + return False + + if len(req.specifier) > 0: + if hasattr(pkg, "__version__"): + return pkg.__version__ in req.specifier + else: + return False + + return True From e13ec051ce68ed98e877e656fa0df8ea6273aea2 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 8 Apr 2024 10:46:44 -0700 Subject: [PATCH 04/80] update renumbering to consider seed vertices --- .../sampling_post_processing_impl.cuh | 622 ++++++++++++++---- 1 file changed, 492 insertions(+), 130 deletions(-) diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index 5060b283659..ed8d9ba0cf1 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -49,7 +49,7 @@ namespace cugraph { namespace { -template +template struct edge_order_t { thrust::optional> edgelist_label_offsets{thrust::nullopt}; thrust::optional> edgelist_hops{thrust::nullopt}; @@ -59,6 +59,7 @@ struct edge_order_t { __device__ bool operator()(size_t l_idx, size_t r_idx) const { if (edgelist_label_offsets) { + // FIXME: (*edgelist_label_offsets)[0] == always 0??? 
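+      // Note (descriptive only): the lookup below is the usual CSR-style offset search --
+      // thrust::upper_bound over [offsets.begin() + 1, offsets.end()) followed by
+      // thrust::distance from offsets.begin() + 1 yields the label index of the edge at
+      // l_idx / r_idx. This assumes *edgelist_label_offsets is non-decreasing, starts at 0,
+      // and ends at the number of edges (checked in check_input_edges() when
+      // do_expensive_check is set).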
auto l_label = thrust::distance((*edgelist_label_offsets).begin() + 1, thrust::upper_bound(thrust::seq, (*edgelist_label_offsets).begin() + 1, @@ -172,11 +173,9 @@ void check_input_edges(raft::handle_t const& handle, "Invalid input arguments: num_labels should be a positive integer and the " "current implementation assumes that the number of unique labels is no larger " "than std::numeric_limits::max()."); - CUGRAPH_EXPECTS( - ((num_labels == 1) && !edgelist_label_offsets.has_value()) || - (num_labels >= 2 && edgelist_label_offsets.has_value()), - "Invalid input arguments: edgelist_label_offsets should be std::nullopt if num_labels == 1 and " - "edgelist_label_offsets.has_value() should be true if num_labels >= 2."); + CUGRAPH_EXPECTS((num_labels == 1) || edgelist_label_offsets.has_value(), + "Invalid input arguments: edgelist_label_offsets.has_value() should be true if " + "num_labels >= 2."); CUGRAPH_EXPECTS( !edgelist_label_offsets.has_value() || ((*edgelist_label_offsets).size() == num_labels + 1), "Invalid input arguments: if edgelist_label_offsets is valid, (*edgelist_label_offsets).size() " @@ -187,16 +186,16 @@ void check_input_edges(raft::handle_t const& handle, "Invalid input arguments: num_hops should be a positive integer and the current implementation " "assumes that the number of hops is no larger than std::numeric_limits::max()."); CUGRAPH_EXPECTS( - ((num_hops == 1) && !edgelist_hops.has_value()) || (num_hops >= 2 && edgelist_hops.has_value()), - "Invalid input arguments: edgelist_hops should be std::nullopt if num_hops == 1 and " - "edgelist_hops.has_value() should be true if num_hops >= 2."); - - CUGRAPH_EXPECTS( - ((!seed_vertices.has_value() || (num_labels == 1)) && !seed_vertex_label_offsets.has_value()) || - ((seed_vertices.has_value() && (num_labels >= 2)) && seed_vertex_label_offsets.has_value()), - "Invaild input arguments: if seed_vertices.has_value() is true and num_labels >= 2, " - "seed_vertex_label_offsets.has_value() should be true. Otherwise, " - "seed_vertex_label_offsets.has_value() should be false."); + (num_hops == 1) || edgelist_hops.has_value(), + "Invalid input arguments: edgelist_hops.has_value() should be true if num_hops >= 2."); + + CUGRAPH_EXPECTS((!seed_vertices.has_value() && !seed_vertex_label_offsets.has_value()) || + (seed_vertices.has_value() && + (edgelist_label_offsets.has_value() == seed_vertex_label_offsets.has_value())), + "Invaild input arguments: if seed_vertices.has_value() is false, " + "seed_vertex_label_offsets.has_value() should be false as well. If " + "seed_vertices.has_value( ) is true, seed_vertex_label_offsets.has_value() " + "should coincide with edgelist_label_offsets.has_value()."); CUGRAPH_EXPECTS( !seed_vertex_label_offsets.has_value() || ((*seed_vertex_label_offsets).size() == num_labels + 1), @@ -222,6 +221,8 @@ void check_input_edges(raft::handle_t const& handle, "Invalid input arguments: if edgelist_hops is valid, (*edgelist_hops).size() and " "edgelist_(srcs|dsts).size() should coincide."); + // FIXME: should check edgelist_hops elements are non-decreasing within each label? Or this + // requirement is not necessary? 
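+  // One way the check suggested above could look (illustrative sketch only, e.g. placed inside
+  // the do_expensive_check block below); it flags adjacent edges that fall in the same label
+  // segment but have decreasing hop values:
+  //
+  //   if (edgelist_hops && edgelist_label_offsets) {
+  //     auto num_decreasing = thrust::count_if(
+  //       handle.get_thrust_policy(),
+  //       thrust::make_counting_iterator(size_t{1}),
+  //       thrust::make_counting_iterator(edgelist_majors.size()),
+  //       [edgelist_label_offsets = *edgelist_label_offsets,
+  //        edgelist_hops          = *edgelist_hops] __device__(size_t i) {
+  //         auto l_it = thrust::upper_bound(
+  //           thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), i - 1);
+  //         auto r_it = thrust::upper_bound(
+  //           thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), i);
+  //         return (l_it == r_it) && (edgelist_hops[i - 1] > edgelist_hops[i]);
+  //       });
+  //     CUGRAPH_EXPECTS(num_decreasing == 0,
+  //                     "Invalid input arguments: edgelist_hops values should be non-decreasing "
+  //                     "within each label.");
+  //   }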
if (do_expensive_check) { if (edgelist_label_offsets) { CUGRAPH_EXPECTS(thrust::is_sorted(handle.get_thrust_policy(), @@ -229,12 +230,19 @@ void check_input_edges(raft::handle_t const& handle, (*edgelist_label_offsets).end()), "Invalid input arguments: if edgelist_label_offsets is valid, " "*edgelist_label_offsets should be sorted."); + size_t front_element{}; + raft::update_host( + &front_element, (*edgelist_label_offsets).data(), size_t{1}, handle.get_stream()); size_t back_element{}; raft::update_host(&back_element, (*edgelist_label_offsets).data() + num_labels, size_t{1}, handle.get_stream()); handle.sync_stream(); + CUGRAPH_EXPECTS( + front_element == size_t{0}, + "Invalid input arguments: if edgelist_label_offsets is valid, the first element of " + "*edgelist_label_offsets should be 0."); CUGRAPH_EXPECTS( back_element == edgelist_majors.size(), "Invalid input arguments: if edgelist_label_offsets is valid, the last element of " @@ -334,71 +342,121 @@ std::tuple> /* label indices */ std::optional> /* label offsets for the output */> compute_min_hop_for_unique_label_vertex_pairs( raft::handle_t const& handle, - raft::device_span vertices, - std::optional> hops, - std::optional> label_indices, - std::optional> label_offsets) + raft::device_span edgelist_vertices, + std::optional> edgelist_hops, + std::optional> seed_vertices, + std::optional> seed_vertex_label_offsets, + std::optional> edgelist_label_offsets) { auto approx_edges_to_sort_per_iteration = static_cast(handle.get_device_properties().multiProcessorCount) * - (1 << 20) /* tuning parameter */; // for segmented sort - - if (label_indices) { - auto num_labels = (*label_offsets).size() - 1; - - rmm::device_uvector tmp_label_indices((*label_indices).size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - (*label_indices).begin(), - (*label_indices).end(), - tmp_label_indices.begin()); + (1 << 18) /* tuning parameter */; // for segmented sort + if (edgelist_label_offsets) { + rmm::device_uvector tmp_label_indices(0, handle.get_stream()); rmm::device_uvector tmp_vertices(0, handle.get_stream()); std::optional> tmp_hops{std::nullopt}; - if (hops) { - // FIXME: why not use cub::DeviceSegmentedSort::SortPairs??? 
- tmp_vertices.resize(vertices.size(), handle.get_stream()); - thrust::copy( - handle.get_thrust_policy(), vertices.begin(), vertices.end(), tmp_vertices.begin()); - tmp_hops = rmm::device_uvector((*hops).size(), handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), (*hops).begin(), (*hops).end(), (*tmp_hops).begin()); + auto [h_label_offsets, h_edge_offsets] = + detail::compute_offset_aligned_element_chunks(handle, + *edgelist_label_offsets, + edgelist_vertices.size(), + approx_edges_to_sort_per_iteration); + auto num_chunks = h_label_offsets.size() - 1; - auto triplet_first = thrust::make_zip_iterator( - tmp_label_indices.begin(), tmp_vertices.begin(), (*tmp_hops).begin()); - thrust::sort( - handle.get_thrust_policy(), triplet_first, triplet_first + tmp_label_indices.size()); - auto key_first = thrust::make_zip_iterator(tmp_label_indices.begin(), tmp_vertices.begin()); - auto num_uniques = static_cast( - thrust::distance(key_first, - thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), - key_first, - key_first + tmp_label_indices.size(), - (*tmp_hops).begin())))); - tmp_label_indices.resize(num_uniques, handle.get_stream()); - tmp_vertices.resize(num_uniques, handle.get_stream()); - (*tmp_hops).resize(num_uniques, handle.get_stream()); - tmp_label_indices.shrink_to_fit(handle.get_stream()); - tmp_vertices.shrink_to_fit(handle.get_stream()); - (*tmp_hops).shrink_to_fit(handle.get_stream()); - } else { - rmm::device_uvector segment_sorted_vertices(vertices.size(), handle.get_stream()); + if (edgelist_hops) { + rmm::device_uvector tmp_indices(edgelist_vertices.size(), handle.get_stream()); + thrust::sequence( + handle.get_thrust_policy(), tmp_indices.begin(), tmp_indices.end(), size_t{0}); - rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + // cub::DeviceSegmentedSort currently does not suuport thrust::tuple type keys, sorting in + // chunks still helps in limiting the binary search range and improving memory locality + for (size_t i = 0; i < num_chunks; ++i) { + thrust::sort( + handle.get_thrust_policy(), + tmp_indices.begin() + h_edge_offsets[i], + tmp_indices.begin() + h_edge_offsets[i + 1], + [edgelist_label_offsets = + raft::device_span((*edgelist_label_offsets).data() + h_label_offsets[i], + (h_label_offsets[i + 1] - h_label_offsets[i]) + 1), + edgelist_vertices, + edgelist_hops = *edgelist_hops] __device__(size_t l_idx, size_t r_idx) { + auto l_it = thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), l_idx); + auto r_it = thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), r_idx); + if (l_it != r_it) { return l_it < r_it; } + + auto l_vertex = edgelist_vertices[l_idx]; + auto r_vertex = edgelist_vertices[r_idx]; + if (l_vertex != r_vertex) { return l_vertex < r_vertex; } + + auto l_hop = edgelist_hops[l_idx]; + auto r_hop = edgelist_hops[r_idx]; + return l_hop < r_hop; + }); + } - auto [h_label_offsets, h_edge_offsets] = detail::compute_offset_aligned_element_chunks( - handle, *label_offsets, vertices.size(), approx_edges_to_sort_per_iteration); - auto num_chunks = h_label_offsets.size() - 1; + tmp_indices.resize( + thrust::distance( + tmp_indices.begin(), + thrust::unique(handle.get_thrust_policy(), + tmp_indices.begin(), + tmp_indices.end(), + [edgelist_label_offsets = *edgelist_label_offsets, + edgelist_vertices, + edgelist_hops = *edgelist_hops] __device__(size_t l_idx, size_t r_idx) { + auto l_it = thrust::upper_bound(thrust::seq, + 
edgelist_label_offsets.begin() + 1, + edgelist_label_offsets.end(), + l_idx); + auto r_it = thrust::upper_bound(thrust::seq, + edgelist_label_offsets.begin() + 1, + edgelist_label_offsets.end(), + r_idx); + if (l_it != r_it) { return false; } + + auto l_vertex = edgelist_vertices[l_idx]; + auto r_vertex = edgelist_vertices[r_idx]; + return l_vertex == r_vertex; + })), + handle.get_stream()); + tmp_label_indices.resize(tmp_indices.size(), handle.get_stream()); + tmp_vertices.resize(tmp_indices.size(), handle.get_stream()); + tmp_hops = rmm::device_uvector(tmp_indices.size(), handle.get_stream()); + + auto triplet_first = thrust::make_transform_iterator( + tmp_indices.begin(), + cuda::proclaim_return_type>( + [edgelist_label_offsets = *edgelist_label_offsets, + edgelist_vertices, + edgelist_hops = *edgelist_hops] __device__(size_t i) { + auto label_idx = static_cast(thrust::distance( + edgelist_label_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), i))); + return thrust::make_tuple(label_idx, edgelist_vertices[i], edgelist_hops[i]); + })); + thrust::copy(handle.get_thrust_policy(), + triplet_first, + triplet_first + tmp_indices.size(), + thrust::make_zip_iterator( + tmp_label_indices.begin(), tmp_vertices.begin(), (*tmp_hops).begin())); + } else { + rmm::device_uvector segment_sorted_vertices(edgelist_vertices.size(), + handle.get_stream()); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); for (size_t i = 0; i < num_chunks; ++i) { size_t tmp_storage_bytes{0}; auto offset_first = - thrust::make_transform_iterator((*label_offsets).data() + h_label_offsets[i], + thrust::make_transform_iterator((*edgelist_label_offsets).data() + h_label_offsets[i], detail::shift_left_t{h_edge_offsets[i]}); cub::DeviceSegmentedSort::SortKeys(static_cast(nullptr), tmp_storage_bytes, - vertices.begin() + h_edge_offsets[i], + edgelist_vertices.begin() + h_edge_offsets[i], segment_sorted_vertices.begin() + h_edge_offsets[i], h_edge_offsets[i + 1] - h_edge_offsets[i], h_label_offsets[i + 1] - h_label_offsets[i], @@ -412,7 +470,7 @@ compute_min_hop_for_unique_label_vertex_pairs( cub::DeviceSegmentedSort::SortKeys(d_tmp_storage.data(), tmp_storage_bytes, - vertices.begin() + h_edge_offsets[i], + edgelist_vertices.begin() + h_edge_offsets[i], segment_sorted_vertices.begin() + h_edge_offsets[i], h_edge_offsets[i + 1] - h_edge_offsets[i], h_label_offsets[i + 1] - h_label_offsets[i], @@ -423,27 +481,227 @@ compute_min_hop_for_unique_label_vertex_pairs( d_tmp_storage.resize(0, handle.get_stream()); d_tmp_storage.shrink_to_fit(handle.get_stream()); - auto pair_first = - thrust::make_zip_iterator(tmp_label_indices.begin(), segment_sorted_vertices.begin()); - auto num_uniques = static_cast(thrust::distance( - pair_first, - thrust::unique( - handle.get_thrust_policy(), pair_first, pair_first + tmp_label_indices.size()))); + tmp_label_indices.resize(segment_sorted_vertices.size(), handle.get_stream()); + tmp_vertices.resize(segment_sorted_vertices.size(), handle.get_stream()); + + auto input_pair_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type>( + [edgelist_label_offsets = *edgelist_label_offsets, + edgelist_vertices = raft::device_span( + segment_sorted_vertices.data(), segment_sorted_vertices.size())] __device__(size_t i) { + auto label_idx = static_cast(thrust::distance( + edgelist_label_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, 
edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), i))); + return thrust::make_tuple(label_idx, edgelist_vertices[i]); + })); + auto output_pair_first = + thrust::make_zip_iterator(tmp_label_indices.begin(), tmp_vertices.begin()); + auto num_uniques = + thrust::distance(output_pair_first, + thrust::unique_copy(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + segment_sorted_vertices.size(), + output_pair_first)); tmp_label_indices.resize(num_uniques, handle.get_stream()); - segment_sorted_vertices.resize(num_uniques, handle.get_stream()); + tmp_vertices.resize(num_uniques, handle.get_stream()); tmp_label_indices.shrink_to_fit(handle.get_stream()); - segment_sorted_vertices.shrink_to_fit(handle.get_stream()); + tmp_vertices.shrink_to_fit(handle.get_stream()); + } + + if (seed_vertices) { + /* label segmented sort */ + + rmm::device_uvector segment_sorted_vertices((*seed_vertices).size(), + handle.get_stream()); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + size_t tmp_storage_bytes{0}; + + cub::DeviceSegmentedSort::SortKeys(static_cast(nullptr), + tmp_storage_bytes, + (*seed_vertices).begin(), + segment_sorted_vertices.begin(), + (*seed_vertices).size(), + (*seed_vertex_label_offsets).size() - 1, + (*seed_vertex_label_offsets).begin(), + (*seed_vertex_label_offsets).begin() + 1, + handle.get_stream()); + + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + + cub::DeviceSegmentedSort::SortKeys(d_tmp_storage.data(), + tmp_storage_bytes, + (*seed_vertices).begin(), + segment_sorted_vertices.begin(), + (*seed_vertices).size(), + (*seed_vertex_label_offsets).size() - 1, + (*seed_vertex_label_offsets).begin(), + (*seed_vertex_label_offsets).begin() + 1, + handle.get_stream()); + + /* enumerate unique (label, vertex) pairs */ + + rmm::device_uvector unique_seed_vertex_label_indices((*seed_vertices).size(), + handle.get_stream()); + rmm::device_uvector unique_seed_vertices((*seed_vertices).size(), + handle.get_stream()); + auto input_pair_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type>( + [seed_vertex_label_offsets = *seed_vertex_label_offsets, + seed_vertices = raft::device_span( + segment_sorted_vertices.data(), segment_sorted_vertices.size())] __device__(size_t i) { + auto label_idx = static_cast( + thrust::distance(seed_vertex_label_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + seed_vertex_label_offsets.begin() + 1, + seed_vertex_label_offsets.end(), + i))); + return thrust::make_tuple(label_idx, seed_vertices[i]); + })); + auto output_pair_first = thrust::make_zip_iterator(unique_seed_vertex_label_indices.begin(), + unique_seed_vertices.begin()); + auto num_uniques = + thrust::distance(output_pair_first, + thrust::unique_copy(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + segment_sorted_vertices.size(), + output_pair_first)); + unique_seed_vertex_label_indices.resize( + thrust::distance(output_pair_first, + thrust::unique_copy(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + segment_sorted_vertices.size(), + output_pair_first)), + handle.get_stream()); + unique_seed_vertices.resize(unique_seed_vertex_label_indices.size(), handle.get_stream()); + + /* merge with the (label, vertex, min. 
hop) triplets from the edgelist */ + + if (edgelist_hops) { + auto triplet_from_edgelist_first = thrust::make_zip_iterator( + tmp_label_indices.begin(), tmp_vertices.begin(), (*tmp_hops).begin()); + auto key_pair_from_seed_vertex_first = thrust::make_zip_iterator( + unique_seed_vertex_label_indices.begin(), unique_seed_vertices.begin()); + thrust::for_each( + handle.get_thrust_policy(), + key_pair_from_seed_vertex_first, + key_pair_from_seed_vertex_first + unique_seed_vertex_label_indices.size(), + [triplet_from_edgelist_first, + triplet_from_edgelist_last = + triplet_from_edgelist_first + tmp_label_indices.size()] __device__(auto pair) { + auto it = thrust::lower_bound( + thrust::seq, + triplet_from_edgelist_first, + triplet_from_edgelist_last, + thrust::make_tuple(thrust::get<0>(pair), thrust::get<1>(pair), int32_t{0})); + if ((it != triplet_from_edgelist_last) && + (thrust::get<0>(*it) == thrust::get<0>(pair)) && + (thrust::get<1>(*it) == thrust::get<1>(pair))) { + // update min. hop to 0 + if (thrust::get<2>(*it) != int32_t{0}) { thrust::get<2>(*it) = int32_t{0}; } + } + }); - tmp_vertices = std::move(segment_sorted_vertices); + unique_seed_vertex_label_indices.resize( + thrust::distance( + key_pair_from_seed_vertex_first, + thrust::remove_if( + handle.get_thrust_policy(), + key_pair_from_seed_vertex_first, + key_pair_from_seed_vertex_first + unique_seed_vertices.size(), + [triplet_from_edgelist_first, + triplet_from_edgelist_last = + triplet_from_edgelist_first + tmp_label_indices.size()] __device__(auto pair) { + auto it = thrust::lower_bound( + thrust::seq, + triplet_from_edgelist_first, + triplet_from_edgelist_last, + thrust::make_tuple(thrust::get<0>(pair), thrust::get<1>(pair), int32_t{0})); + return (it != triplet_from_edgelist_last) && + (thrust::get<0>(*it) == thrust::get<0>(pair)) && + (thrust::get<1>(*it) == thrust::get<1>(pair)); + })), + handle.get_stream()); + unique_seed_vertices.resize(unique_seed_vertex_label_indices.size(), handle.get_stream()); + if (unique_seed_vertex_label_indices.size() > 0) { + rmm::device_uvector merged_label_indices( + tmp_label_indices.size() + unique_seed_vertex_label_indices.size(), + handle.get_stream()); + rmm::device_uvector merged_vertices(merged_label_indices.size(), + handle.get_stream()); + rmm::device_uvector merged_hops(merged_label_indices.size(), + handle.get_stream()); + auto triplet_from_seed_vertex_first = + thrust::make_zip_iterator(unique_seed_vertex_label_indices.begin(), + unique_seed_vertices.begin(), + thrust::make_constant_iterator(int32_t{0})); + thrust::merge( + handle.get_thrust_policy(), + triplet_from_edgelist_first, + triplet_from_edgelist_first + tmp_label_indices.size(), + triplet_from_seed_vertex_first, + triplet_from_seed_vertex_first + unique_seed_vertex_label_indices.size(), + thrust::make_zip_iterator( + merged_label_indices.begin(), merged_vertices.begin(), merged_hops.begin())); + tmp_label_indices = std::move(merged_label_indices); + tmp_vertices = std::move(merged_vertices); + tmp_hops = std::move(merged_hops); + } + } else { + auto pair_from_edgelist_first = + thrust::make_zip_iterator(tmp_label_indices.begin(), tmp_vertices.begin()); + auto pair_from_seed_vertex_first = thrust::make_zip_iterator( + unique_seed_vertex_label_indices.begin(), unique_seed_vertices.begin()); + unique_seed_vertex_label_indices.resize( + thrust::distance( + pair_from_seed_vertex_first, + thrust::remove_if( + handle.get_thrust_policy(), + pair_from_seed_vertex_first, + pair_from_seed_vertex_first + 
unique_seed_vertex_label_indices.size(), + [pair_from_edgelist_first, + pair_from_edgelist_last = + pair_from_edgelist_first + tmp_label_indices.size()] __device__(auto pair) { + auto it = thrust::lower_bound( + thrust::seq, pair_from_edgelist_first, pair_from_edgelist_last, pair); + return (it != pair_from_edgelist_last) && (*it == pair); + })), + handle.get_stream()); + unique_seed_vertices.resize(unique_seed_vertex_label_indices.size(), handle.get_stream()); + if (unique_seed_vertex_label_indices.size() > 0) { + rmm::device_uvector merged_label_indices( + tmp_label_indices.size() + unique_seed_vertex_label_indices.size(), + handle.get_stream()); + rmm::device_uvector merged_vertices(merged_label_indices.size(), + handle.get_stream()); + pair_from_seed_vertex_first = thrust::make_zip_iterator( + unique_seed_vertex_label_indices.begin(), unique_seed_vertices.begin()); + thrust::merge( + handle.get_thrust_policy(), + pair_from_edgelist_first, + pair_from_edgelist_first + tmp_label_indices.size(), + pair_from_seed_vertex_first, + pair_from_seed_vertex_first + unique_seed_vertex_label_indices.size(), + thrust::make_zip_iterator(merged_label_indices.begin(), merged_vertices.begin())); + tmp_label_indices = std::move(merged_label_indices); + tmp_vertices = std::move(merged_vertices); + } + } } - rmm::device_uvector tmp_label_offsets(num_labels + 1, handle.get_stream()); + rmm::device_uvector tmp_label_offsets((*edgelist_label_offsets).size(), + handle.get_stream()); tmp_label_offsets.set_element_to_zero_async(0, handle.get_stream()); thrust::upper_bound(handle.get_thrust_policy(), tmp_label_indices.begin(), tmp_label_indices.end(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(num_labels), + thrust::make_counting_iterator(tmp_label_offsets.size() - 1), tmp_label_offsets.begin() + 1); return std::make_tuple(std::move(tmp_label_indices), @@ -451,28 +709,34 @@ compute_min_hop_for_unique_label_vertex_pairs( std::move(tmp_hops), std::move(tmp_label_offsets)); } else { - rmm::device_uvector tmp_vertices(vertices.size(), handle.get_stream()); - thrust::copy( - handle.get_thrust_policy(), vertices.begin(), vertices.end(), tmp_vertices.begin()); + rmm::device_uvector tmp_vertices(edgelist_vertices.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_vertices.begin(), + edgelist_vertices.end(), + tmp_vertices.begin()); + std::optional> tmp_hops{std::nullopt}; - if (hops) { - rmm::device_uvector tmp_hops((*hops).size(), handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), (*hops).begin(), (*hops).end(), tmp_hops.begin()); + if (edgelist_hops) { + tmp_hops = rmm::device_uvector((*edgelist_hops).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*edgelist_hops).begin(), + (*edgelist_hops).end(), + (*tmp_hops).begin()); auto pair_first = thrust::make_zip_iterator( - tmp_vertices.begin(), tmp_hops.begin()); // vertex is a primary key, hop is a secondary key + tmp_vertices.begin(), + (*tmp_hops).begin()); // vertex is a primary key, hop is a secondary key thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + tmp_vertices.size()); tmp_vertices.resize( thrust::distance(tmp_vertices.begin(), thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end(), - tmp_hops.begin()))), + (*tmp_hops).begin()))), handle.get_stream()); - tmp_hops.resize(tmp_vertices.size(), handle.get_stream()); - - return std::make_tuple( - std::nullopt, std::move(tmp_vertices), 
std::move(tmp_hops), std::nullopt); + (*tmp_hops).resize(tmp_vertices.size(), handle.get_stream()); + tmp_vertices.shrink_to_fit(handle.get_stream()); + (*tmp_hops).shrink_to_fit(handle.get_stream()); } else { thrust::sort(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end()); tmp_vertices.resize( @@ -481,9 +745,109 @@ compute_min_hop_for_unique_label_vertex_pairs( thrust::unique(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end())), handle.get_stream()); tmp_vertices.shrink_to_fit(handle.get_stream()); + } - return std::make_tuple(std::nullopt, std::move(tmp_vertices), std::nullopt, std::nullopt); + if (seed_vertices) { + /* sort and enumerate unique verties */ + + rmm::device_uvector unique_seed_vertices((*seed_vertices).size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*seed_vertices).begin(), + (*seed_vertices).end(), + unique_seed_vertices.begin()); + thrust::sort( + handle.get_thrust_policy(), unique_seed_vertices.begin(), unique_seed_vertices.end()); + unique_seed_vertices.resize(thrust::distance(unique_seed_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + unique_seed_vertices.begin(), + unique_seed_vertices.end())), + handle.get_stream()); + + /* merge with the (vertex, min. hop) pairs from the edgelist */ + + if (edgelist_hops) { + auto pair_from_edgelist_first = + thrust::make_zip_iterator(tmp_vertices.begin(), (*tmp_hops).begin()); + thrust::for_each(handle.get_thrust_policy(), + unique_seed_vertices.begin(), + unique_seed_vertices.end(), + [pair_from_edgelist_first, + pair_from_edgelist_last = + pair_from_edgelist_first + tmp_vertices.size()] __device__(auto v) { + auto it = thrust::lower_bound(thrust::seq, + pair_from_edgelist_first, + pair_from_edgelist_last, + thrust::make_tuple(v, int32_t{0})); + if ((it != pair_from_edgelist_last) && (thrust::get<0>(*it) == v)) { + // update min. 
hop to 0 + if (thrust::get<1>(*it) != int32_t{0}) { + thrust::get<1>(*it) = int32_t{0}; + } + } + }); + + unique_seed_vertices.resize( + thrust::distance(unique_seed_vertices.begin(), + thrust::remove_if( + handle.get_thrust_policy(), + unique_seed_vertices.begin(), + unique_seed_vertices.end(), + [pair_from_edgelist_first, + pair_from_edgelist_last = + pair_from_edgelist_first + tmp_vertices.size()] __device__(auto v) { + auto it = thrust::lower_bound(thrust::seq, + pair_from_edgelist_first, + pair_from_edgelist_last, + thrust::make_tuple(v, int32_t{0})); + return (it != pair_from_edgelist_last) && (thrust::get<0>(*it) == v); + })), + handle.get_stream()); + if (unique_seed_vertices.size() > 0) { + rmm::device_uvector merged_vertices( + tmp_vertices.size() + unique_seed_vertices.size(), handle.get_stream()); + rmm::device_uvector merged_hops(merged_vertices.size(), handle.get_stream()); + auto pair_from_seed_vertex_first = thrust::make_zip_iterator( + unique_seed_vertices.begin(), thrust::make_constant_iterator(int32_t{0})); + thrust::merge(handle.get_thrust_policy(), + pair_from_edgelist_first, + pair_from_edgelist_first + tmp_vertices.size(), + pair_from_seed_vertex_first, + pair_from_seed_vertex_first + unique_seed_vertices.size(), + thrust::make_zip_iterator(merged_vertices.begin(), merged_hops.begin())); + tmp_vertices = std::move(merged_vertices); + tmp_hops = std::move(merged_hops); + } + } else { + unique_seed_vertices.resize( + thrust::distance( + unique_seed_vertices.begin(), + thrust::remove_if(handle.get_thrust_policy(), + unique_seed_vertices.begin(), + unique_seed_vertices.end(), + [tmp_vertices = raft::device_span( + tmp_vertices.data(), tmp_vertices.size())] __device__(auto v) { + auto it = thrust::lower_bound( + thrust::seq, tmp_vertices.begin(), tmp_vertices.end(), v); + return (it != tmp_vertices.end()) && (*it == v); + })), + handle.get_stream()); + if (unique_seed_vertices.size() > 0) { + rmm::device_uvector merged_vertices( + tmp_vertices.size() + unique_seed_vertices.size(), handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + tmp_vertices.begin(), + tmp_vertices.end(), + unique_seed_vertices.begin(), + unique_seed_vertices.end(), + merged_vertices.begin()); + tmp_vertices = std::move(merged_vertices); + } + } } + + return std::make_tuple( + std::nullopt, std::move(tmp_vertices), std::move(tmp_hops), std::nullopt); } } @@ -497,46 +861,34 @@ compute_renumber_map(raft::handle_t const& handle, std::optional> seed_vertex_label_offsets, std::optional> edgelist_label_offsets) { - if (seed_vertices) { CUGRAPH_FAIL("unimplemented."); } - auto approx_edges_to_sort_per_iteration = static_cast(handle.get_device_properties().multiProcessorCount) * (1 << 20) /* tuning parameter */; // for segmented sort - std::optional> edgelist_label_indices{std::nullopt}; - if (edgelist_label_offsets) { - edgelist_label_indices = - detail::expand_sparse_offsets(*edgelist_label_offsets, label_index_t{0}, handle.get_stream()); - } - auto [unique_label_major_pair_label_indices, unique_label_major_pair_vertices, unique_label_major_pair_hops, unique_label_major_pair_label_offsets] = - compute_min_hop_for_unique_label_vertex_pairs( + compute_min_hop_for_unique_label_vertex_pairs( handle, edgelist_majors, edgelist_hops, - edgelist_label_indices ? 
std::make_optional>( - (*edgelist_label_indices).data(), (*edgelist_label_indices).size()) - : std::nullopt, + seed_vertices, + seed_vertex_label_offsets, edgelist_label_offsets); auto [unique_label_minor_pair_label_indices, unique_label_minor_pair_vertices, unique_label_minor_pair_hops, unique_label_minor_pair_label_offsets] = - compute_min_hop_for_unique_label_vertex_pairs( + compute_min_hop_for_unique_label_vertex_pairs( handle, edgelist_minors, edgelist_hops, - edgelist_label_indices ? std::make_optional>( - (*edgelist_label_indices).data(), (*edgelist_label_indices).size()) - : std::nullopt, + std::optional>{std::nullopt}, + std::nullopt, edgelist_label_offsets); - edgelist_label_indices = std::nullopt; - if (edgelist_label_offsets) { auto num_labels = (*edgelist_label_offsets).size() - 1; @@ -744,8 +1096,8 @@ compute_renumber_map(raft::handle_t const& handle, } } -// this function does not reorder edges (the i'th returned edge is the renumbered output of the i'th -// input edge) +// this function does not reorder edges (the i'th returned edge is the renumbered output of the +// i'th input edge) template std::tuple, rmm::device_uvector, @@ -885,27 +1237,30 @@ renumber_sampled_edgelist(raft::handle_t const& handle, new_vertices.shrink_to_fit(handle.get_stream()); d_tmp_storage.shrink_to_fit(handle.get_stream()); - auto edgelist_label_indices = - detail::expand_sparse_offsets(*edgelist_label_offsets, label_index_t{0}, handle.get_stream()); - auto pair_first = - thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_label_indices.begin()); + thrust::make_zip_iterator(edgelist_majors.begin(), thrust::make_counting_iterator(size_t{0})); thrust::transform( handle.get_thrust_policy(), pair_first, pair_first + edgelist_majors.size(), edgelist_majors.begin(), - [renumber_map_label_offsets = raft::device_span( + [edgelist_label_offsets = *edgelist_label_offsets, + renumber_map_label_offsets = raft::device_span( (*renumber_map_label_offsets).data(), (*renumber_map_label_offsets).size()), old_vertices = raft::device_span(segment_sorted_renumber_map.data(), segment_sorted_renumber_map.size()), new_vertices = raft::device_span( segment_sorted_new_vertices.data(), segment_sorted_new_vertices.size())] __device__(auto pair) { - auto old_vertex = thrust::get<0>(pair); - auto label_index = thrust::get<1>(pair); - auto label_start_offset = renumber_map_label_offsets[label_index]; - auto label_end_offset = renumber_map_label_offsets[label_index + 1]; + auto old_vertex = thrust::get<0>(pair); + auto label_idx = static_cast( + thrust::distance(edgelist_label_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + edgelist_label_offsets.begin() + 1, + edgelist_label_offsets.end(), + thrust::get<1>(pair)))); + auto label_start_offset = renumber_map_label_offsets[label_idx]; + auto label_end_offset = renumber_map_label_offsets[label_idx + 1]; auto it = thrust::lower_bound(thrust::seq, old_vertices.begin() + label_start_offset, old_vertices.begin() + label_end_offset, @@ -914,23 +1269,30 @@ renumber_sampled_edgelist(raft::handle_t const& handle, return *(new_vertices.begin() + thrust::distance(old_vertices.begin(), it)); }); - pair_first = thrust::make_zip_iterator(edgelist_minors.begin(), edgelist_label_indices.begin()); + pair_first = + thrust::make_zip_iterator(edgelist_minors.begin(), thrust::make_counting_iterator(size_t{0})); thrust::transform( handle.get_thrust_policy(), pair_first, pair_first + edgelist_minors.size(), edgelist_minors.begin(), - [renumber_map_label_offsets = raft::device_span( + 
[edgelist_label_offsets = *edgelist_label_offsets, + renumber_map_label_offsets = raft::device_span( (*renumber_map_label_offsets).data(), (*renumber_map_label_offsets).size()), old_vertices = raft::device_span(segment_sorted_renumber_map.data(), segment_sorted_renumber_map.size()), new_vertices = raft::device_span( segment_sorted_new_vertices.data(), segment_sorted_new_vertices.size())] __device__(auto pair) { - auto old_vertex = thrust::get<0>(pair); - auto label_index = thrust::get<1>(pair); - auto label_start_offset = renumber_map_label_offsets[label_index]; - auto label_end_offset = renumber_map_label_offsets[label_index + 1]; + auto old_vertex = thrust::get<0>(pair); + auto label_idx = static_cast( + thrust::distance(edgelist_label_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + edgelist_label_offsets.begin() + 1, + edgelist_label_offsets.end(), + thrust::get<1>(pair)))); + auto label_start_offset = renumber_map_label_offsets[label_idx]; + auto label_end_offset = renumber_map_label_offsets[label_idx + 1]; auto it = thrust::lower_bound(thrust::seq, old_vertices.begin() + label_start_offset, old_vertices.begin() + label_end_offset, @@ -1017,7 +1379,7 @@ sort_sampled_edge_tuples(raft::handle_t const& handle, rmm::device_uvector indices(h_edge_offsets[i + 1] - h_edge_offsets[i], handle.get_stream()); thrust::sequence(handle.get_thrust_policy(), indices.begin(), indices.end(), size_t{0}); - edge_order_t edge_order_comp{ + edge_order_t edge_order_comp{ edgelist_label_offsets ? thrust::make_optional>( (*edgelist_label_offsets).data() + h_label_offsets[i], (h_label_offsets[i + 1] - h_label_offsets[i]) + 1) @@ -1077,10 +1439,10 @@ std::tuple>, // dcsr/dcsc major std::optional>, // weights std::optional>, // edge IDs std::optional>, // edge types - std::optional>, // (label, hop) offsets to the (d)csr/(d)csc - // offset array - rmm::device_uvector, // renumber map - std::optional>> // label offsets to the renumber map + std::optional>, // (label, hop) offsets to the + // (d)csr/(d)csc offset array + rmm::device_uvector, // renumber map + std::optional>> // label offsets to the renumber map renumber_and_compress_sampled_edgelist( raft::handle_t const& handle, rmm::device_uvector&& edgelist_srcs, @@ -1307,8 +1669,8 @@ renumber_and_compress_sampled_edgelist( compressed_offsets.end(), compressed_offsets.begin()); - // 5. update compressed_offsets to include zero degree vertices (if doubly_compress is false) and - // compressed_offset_label_hop_offsets (if edgelist_label_offsets.has_value() or + // 5. 
update compressed_offsets to include zero degree vertices (if doubly_compress is false) + // and compressed_offset_label_hop_offsets (if edgelist_label_offsets.has_value() or // edgelist_hops.has_value() is true) std::optional> compressed_offset_label_hop_offsets{std::nullopt}; From 18e199e4e2dedc9b63a0f2053a86514a2512538f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 9 Apr 2024 10:05:28 -0700 Subject: [PATCH 05/80] update tests --- .../sampling_post_processing_impl.cuh | 10 +- .../sampling/sampling_post_processing_test.cu | 414 +++++++++++++----- 2 files changed, 309 insertions(+), 115 deletions(-) diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index ed8d9ba0cf1..d05a25bd8c7 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -59,7 +59,6 @@ struct edge_order_t { __device__ bool operator()(size_t l_idx, size_t r_idx) const { if (edgelist_label_offsets) { - // FIXME: (*edgelist_label_offsets)[0] == always 0??? auto l_label = thrust::distance((*edgelist_label_offsets).begin() + 1, thrust::upper_bound(thrust::seq, (*edgelist_label_offsets).begin() + 1, @@ -92,7 +91,7 @@ struct edge_order_t { }; template -struct is_first_in_run_t { +struct is_first_triplet_in_run_t { thrust::optional> edgelist_label_offsets{thrust::nullopt}; thrust::optional> edgelist_hops{thrust::nullopt}; raft::device_span edgelist_majors{}; @@ -885,7 +884,7 @@ compute_renumber_map(raft::handle_t const& handle, handle, edgelist_minors, edgelist_hops, - std::optional>{std::nullopt}, + std::nullopt, std::nullopt, edgelist_label_offsets); @@ -1540,6 +1539,7 @@ renumber_and_compress_sampled_edgelist( auto output_key_first = thrust::make_zip_iterator(unique_key_label_indices.begin(), unique_key_hops.begin()); + // FIXME: should I consider seed_vertices in computing min,max? auto output_it = thrust::reduce_by_key(handle.get_thrust_policy(), input_key_first, @@ -1581,7 +1581,7 @@ renumber_and_compress_sampled_edgelist( CUGRAPH_EXPECTS(num_invalids == 0, "Invalid input arguments: if @p compress_per_hop is false and @p " "edgelist_hops.has_value() is true, the minimum majors with hop N + 1 " - "should be larger than the maximum majors with hop N after renumbering."); + "should be larger than the maximum majors with hop N after renumbering."); // FIXME: re-phrase to input requirements? } } } @@ -1593,7 +1593,7 @@ renumber_and_compress_sampled_edgelist( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(edgelist_majors.size()), - is_first_in_run_t{ + is_first_triplet_in_run_t{ detail::to_thrust_optional(edgelist_label_offsets), edgelist_hops ? 
thrust::make_optional>( (*edgelist_hops).data(), (*edgelist_hops).size()) diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu index 3e5b0a09179..c87cc5b960b 100644 --- a/cpp/tests/sampling/sampling_post_processing_test.cu +++ b/cpp/tests/sampling/sampling_post_processing_test.cu @@ -47,6 +47,7 @@ struct SamplingPostProcessing_Usecase { bool sample_with_replacement{false}; bool src_is_major{true}; + bool renumber_with_seeds{false}; bool compress_per_hop{false}; bool doubly_compress{false}; bool check_correctness{true}; @@ -175,6 +176,7 @@ bool compare_edgelist(raft::handle_t const& handle, template bool check_renumber_map_invariants( raft::handle_t const& handle, + std::optional> starting_vertices, raft::device_span org_edgelist_srcs, raft::device_span org_edgelist_dsts, std::optional> org_edgelist_hops, @@ -193,6 +195,15 @@ bool check_renumber_map_invariants( org_edgelist_majors.begin(), org_edgelist_majors.end(), unique_majors.begin()); + if (starting_vertices) { + auto old_size = unique_majors.size(); + unique_majors.resize(old_size + (*starting_vertices).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*starting_vertices).begin(), + (*starting_vertices).end(), + unique_majors.begin() + old_size); + } + std::optional> unique_major_hops = org_edgelist_hops ? std::make_optional>( (*org_edgelist_hops).size(), handle.get_stream()) @@ -202,6 +213,14 @@ bool check_renumber_map_invariants( (*org_edgelist_hops).begin(), (*org_edgelist_hops).end(), (*unique_major_hops).begin()); + if (starting_vertices) { + auto old_size = (*unique_major_hops).size(); + (*unique_major_hops).resize(old_size + (*starting_vertices).size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*unique_major_hops).begin() + old_size, + (*unique_major_hops).end(), + int32_t{0}); + } auto pair_first = thrust::make_zip_iterator(unique_majors.begin(), (*unique_major_hops).begin()); @@ -476,6 +495,11 @@ class Tests_SamplingPostProcessing ? std::make_optional>( starting_vertices.size(), handle.get_stream()) : std::nullopt; + auto starting_vertex_label_offsets = + (sampling_post_processing_usecase.num_labels > 1) + ? 
std::make_optional>( + sampling_post_processing_usecase.num_labels + 1, handle.get_stream()) + : std::nullopt; if (starting_vertex_labels) { thrust::tabulate( handle.get_thrust_policy(), @@ -483,6 +507,12 @@ class Tests_SamplingPostProcessing (*starting_vertex_labels).end(), [num_seeds_per_label = sampling_post_processing_usecase.num_seeds_per_label] __device__( size_t i) { return static_cast(i / num_seeds_per_label); }); + thrust::tabulate( + handle.get_thrust_policy(), + (*starting_vertex_label_offsets).begin(), + (*starting_vertex_label_offsets).end(), + [num_seeds_per_label = sampling_post_processing_usecase.num_seeds_per_label] __device__( + size_t i) { return num_seeds_per_label * i; }); } rmm::device_uvector org_edgelist_srcs(0, handle.get_stream()); @@ -530,10 +560,6 @@ class Tests_SamplingPostProcessing std::swap(org_edgelist_srcs, org_edgelist_dsts); } - starting_vertices.resize(0, handle.get_stream()); - starting_vertices.shrink_to_fit(handle.get_stream()); - starting_vertex_labels = std::nullopt; - { rmm::device_uvector renumbered_and_sorted_edgelist_srcs(org_edgelist_srcs.size(), handle.get_stream()); @@ -600,8 +626,14 @@ class Tests_SamplingPostProcessing std::move(renumbered_and_sorted_edgelist_edge_ids), std::move(renumbered_and_sorted_edgelist_edge_types), std::move(renumbered_and_sorted_edgelist_hops), - std::nullopt, - std::nullopt, + sampling_post_processing_usecase.renumber_with_seeds + ? std::make_optional>(starting_vertices.data(), + starting_vertices.size()) + : std::nullopt, + (sampling_post_processing_usecase.renumber_with_seeds && starting_vertex_label_offsets) + ? std::make_optional>( + (*starting_vertex_label_offsets).data(), (*starting_vertex_label_offsets).size()) + : std::nullopt, org_edgelist_label_offsets ? std::make_optional(raft::device_span( (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size())) @@ -658,6 +690,15 @@ class Tests_SamplingPostProcessing } for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { + size_t starting_vertex_start_offset = + starting_vertex_label_offsets + ? (*starting_vertex_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t starting_vertex_end_offset = + starting_vertex_label_offsets + ? (*starting_vertex_label_offsets).element(i + 1, handle.get_stream()) + : starting_vertices.size(); + size_t edgelist_start_offset = org_edgelist_label_offsets ? (*org_edgelist_label_offsets).element(i, handle.get_stream()) @@ -668,6 +709,10 @@ class Tests_SamplingPostProcessing : org_edgelist_srcs.size(); if (edgelist_start_offset == edgelist_end_offset) continue; + auto this_label_starting_vertices = raft::device_span( + starting_vertices.data() + starting_vertex_start_offset, + starting_vertex_end_offset - starting_vertex_start_offset); + auto this_label_org_edgelist_srcs = raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, edgelist_end_offset - edgelist_start_offset); @@ -761,12 +806,17 @@ class Tests_SamplingPostProcessing // Check the invariants in renumber_map - ASSERT_TRUE(check_renumber_map_invariants(handle, - this_label_org_edgelist_srcs, - this_label_org_edgelist_dsts, - this_label_org_edgelist_hops, - this_label_output_renumber_map, - sampling_post_processing_usecase.src_is_major)) + ASSERT_TRUE(check_renumber_map_invariants( + handle, + sampling_post_processing_usecase.renumber_with_seeds + ? 
std::make_optional>( + this_label_starting_vertices.data(), this_label_starting_vertices.size()) + : std::nullopt, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_hops, + this_label_output_renumber_map, + sampling_post_processing_usecase.src_is_major)) << "Renumbered and sorted output renumber map violates invariants."; } } @@ -844,8 +894,14 @@ class Tests_SamplingPostProcessing std::move(renumbered_and_compressed_edgelist_edge_ids), std::move(renumbered_and_compressed_edgelist_edge_types), std::move(renumbered_and_compressed_edgelist_hops), - std::nullopt, - std::nullopt, + sampling_post_processing_usecase.renumber_with_seeds + ? std::make_optional>(starting_vertices.data(), + starting_vertices.size()) + : std::nullopt, + (sampling_post_processing_usecase.renumber_with_seeds && starting_vertex_label_offsets) + ? std::make_optional>( + (*starting_vertex_label_offsets).data(), (*starting_vertex_label_offsets).size()) + : std::nullopt, org_edgelist_label_offsets ? std::make_optional(raft::device_span( (*org_edgelist_label_offsets).data(), (*org_edgelist_label_offsets).size())) @@ -926,6 +982,15 @@ class Tests_SamplingPostProcessing } for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { + size_t starting_vertex_start_offset = + starting_vertex_label_offsets + ? (*starting_vertex_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t starting_vertex_end_offset = + starting_vertex_label_offsets + ? (*starting_vertex_label_offsets).element(i + 1, handle.get_stream()) + : starting_vertices.size(); + size_t edgelist_start_offset = org_edgelist_label_offsets ? (*org_edgelist_label_offsets).element(i, handle.get_stream()) @@ -936,6 +1001,10 @@ class Tests_SamplingPostProcessing : org_edgelist_srcs.size(); if (edgelist_start_offset == edgelist_end_offset) continue; + auto this_label_starting_vertices = raft::device_span( + starting_vertices.data() + starting_vertex_start_offset, + starting_vertex_end_offset - starting_vertex_start_offset); + auto this_label_org_edgelist_srcs = raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, edgelist_end_offset - edgelist_start_offset); @@ -1084,12 +1153,17 @@ class Tests_SamplingPostProcessing // Check the invariants in renumber_map - ASSERT_TRUE(check_renumber_map_invariants(handle, - this_label_org_edgelist_srcs, - this_label_org_edgelist_dsts, - this_label_org_edgelist_hops, - this_label_output_renumber_map, - sampling_post_processing_usecase.src_is_major)) + ASSERT_TRUE(check_renumber_map_invariants( + handle, + sampling_post_processing_usecase.renumber_with_seeds + ? 
std::make_optional>( + this_label_starting_vertices.data(), this_label_starting_vertices.size()) + : std::nullopt, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_hops, + this_label_output_renumber_map, + sampling_post_processing_usecase.src_is_major)) << "Renumbered and sorted output renumber map violates invariants."; } } @@ -1311,47 +1385,86 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Combine( // enable correctness checks ::testing::Values( + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, false, false}, SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, false, false}, SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, true, false, false}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, false, false}, SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, false, false}, SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, true, false, false}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, false, false}, SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, true, true}, - SamplingPostProcessing_Usecase{1, 4, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, true, false, false}, SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, true, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, false, false}, SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, true, false, false}, SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, true, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, false, false}, SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, false, true}, - 
SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, true, false, false}, SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, true, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, false, false}, SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, true, false, false}, SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, true, true, false}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, false, false}, SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, true, false, false}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, false, false}, SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, true, false, false}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, false, false}, SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, true, false, false}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, false, false}, SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, true, false, false}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, false, false}, SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, true, false, false}, SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, true, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, false, false, false}, SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, 
{5, 10, 25}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, true, false, false}, SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, true, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, false, false}, SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, true, false, false}, SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, true, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, false, false}, SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, true, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, true, false, true}), + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, true, false, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, true, true, false}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/dolphins.mtx")))); @@ -1361,46 +1474,86 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Combine( // enable correctness checks ::testing::Values( + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, false, false}, SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, false, false}, SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, true, false, false}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, false, false}, SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, false, false}, SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, true, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, false, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, false, false, true, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, false, true, false, 
true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, true, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, true, false, true, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, true, true, false, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, false, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, false, false, true, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, false, true, false, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, true, false, false, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, true, false, true, true}, - SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, true, false, false}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, true, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, true, false, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, true, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, true, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, true, false, false}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, true, true, false}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, false, false}, SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, true, false, false}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, false, false}, SamplingPostProcessing_Usecase{32, 16, 
{10}, false, true, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, true, false, false}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, false, false}, SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, true, false, false}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, false, false}, SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, true, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, false, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, false, false, true, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, false, true, false, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, true, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, true, false, true, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, true, true, false, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, false, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, false, false, true, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, false, true, false, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, true, false, false, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, true, false, true, true}, - SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, true, true, false, true}), + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, true, false, false}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, true, false, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, true, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, true, false, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, true, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, true, false, false}, + 
SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, true, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, true, false, false}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, true, true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); INSTANTIATE_TEST_SUITE_P( @@ -1409,46 +1562,87 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Combine( // enable correctness checks ::testing::Values( - SamplingPostProcessing_Usecase{1, 64, {10}, false, false, false, false, false}, - SamplingPostProcessing_Usecase{1, 64, {10}, false, false, false, true, false}, - SamplingPostProcessing_Usecase{1, 64, {10}, false, true, false, false, false}, - SamplingPostProcessing_Usecase{1, 64, {10}, false, true, false, true, false}, - SamplingPostProcessing_Usecase{1, 64, {10}, true, false, false, false, false}, - SamplingPostProcessing_Usecase{1, 64, {10}, true, false, false, true, false}, - SamplingPostProcessing_Usecase{1, 64, {10}, true, true, false, false, false}, - SamplingPostProcessing_Usecase{1, 64, {10}, true, true, false, true, false}, - SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, false, false, false}, - SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, false, true, false}, - SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, true, false, false}, - SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, false, false, false}, - SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, false, true, false}, - SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, true, false, false}, - SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, false, false, false}, - SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, false, true, false}, - SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, true, false, false}, - SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, false, false, false}, - SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, false, true, false}, - SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, true, false, false}, - SamplingPostProcessing_Usecase{256, 64, {10}, false, false, false, false, false}, - SamplingPostProcessing_Usecase{256, 64, {10}, false, false, false, true, false}, - SamplingPostProcessing_Usecase{256, 64, {10}, false, true, false, false, false}, - SamplingPostProcessing_Usecase{256, 64, {10}, false, true, false, true, false}, - SamplingPostProcessing_Usecase{256, 64, {10}, true, false, false, false, false}, - SamplingPostProcessing_Usecase{256, 64, {10}, true, false, false, true, false}, - SamplingPostProcessing_Usecase{256, 64, {10}, true, true, false, false, false}, - SamplingPostProcessing_Usecase{256, 64, {10}, true, true, false, true, false}, - SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, false, false, false}, - SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, false, true, false}, - SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, true, false, false}, 
- SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, false, false, false}, - SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, false, true, false}, - SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, true, false, false}, - SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, false, false, false}, - SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, false, true, false}, - SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, true, false, false}, - SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, false, false, false}, - SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, false, true, false}, - SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, true, false, false}), + SamplingPostProcessing_Usecase{1, 64, {10}, false, false, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, false, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, false, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, false, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, true, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, true, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, true, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, true, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, false, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, false, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, false, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, false, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, true, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, true, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, true, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, true, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, true, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, true, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, 
{5, 10, 15}, true, false, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, true, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, true, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, false, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, false, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, false, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, false, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, true, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, true, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, true, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, true, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, false, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, false, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, false, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, false, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, true, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, true, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, true, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, true, true, false, true, false}, + SamplingPostProcessing_Usecase{ + 256, 64, {5, 10, 15}, false, false, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, false, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, true, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, false, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, true, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, false, 
false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, false, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, true, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, false, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, true, true, false, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_TEST_PROGRAM_MAIN() From fd6dd23e1e7282f7f25148718bfed760a3ef2d6e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 9 Apr 2024 10:05:41 -0700 Subject: [PATCH 06/80] update documentation --- cpp/include/cugraph/sampling_functions.hpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp index 020e5a7f9b3..0c4b7c3b977 100644 --- a/cpp/include/cugraph/sampling_functions.hpp +++ b/cpp/include/cugraph/sampling_functions.hpp @@ -38,9 +38,12 @@ namespace cugraph { * we can find the minimum (hop, flag) pairs for every unique vertex ID (hop is the primary key and * flag is the secondary key, flag=major is considered smaller than flag=minor if hop numbers are * same). Vertex IDs with smaller (hop, flag) pairs precede vertex IDs with larger (hop, flag) pairs - * in renumbering. Ordering can be arbitrary among the vertices with the same (hop, flag) pairs. + * in renumbering. Ordering can be arbitrary among the vertices with the same (hop, flag) pairs. If + * @p seed_vertices.has-value() is true, we assume (hop=0, flag=major) for every vertex in @p + * *seed_vertices in renumbering (this is relevant when there are seed vertices with no neighbors). * 2. If @p edgelist_hops is invalid, unique vertex IDs in edge majors precede vertex IDs that - * appear only in edge minors. + * appear only in edge minors. If @p seed_vertices.has_value() is true, vertices in @p + * *seed_vertices precede vertex IDs that appear only in edge minors as well. * 3. If edgelist_label_offsets.has_value() is true, edge lists for different labels will be * renumbered separately. * @@ -160,9 +163,12 @@ renumber_and_compress_sampled_edgelist( * we can find the minimum (hop, flag) pairs for every unique vertex ID (hop is the primary key and * flag is the secondary key, flag=major is considered smaller than flag=minor if hop numbers are * same). Vertex IDs with smaller (hop, flag) pairs precede vertex IDs with larger (hop, flag) pairs - * in renumbering. Ordering can be arbitrary among the vertices with the same (hop, flag) pairs. + * in renumbering. Ordering can be arbitrary among the vertices with the same (hop, flag) pairs. 
If + * @p seed_vertices.has-value() is true, we assume (hop=0, flag=major) for every vertex in @p + * *seed_vertices in renumbering (this is relevant when there are seed vertices with no neighbors). * 2. If @p edgelist_hops is invalid, unique vertex IDs in edge majors precede vertex IDs that - * appear only in edge minors. + * appear only in edge minors. If @p seed_vertices.has_value() is true, vertices in @p + * *seed_vertices precede vertex IDs that appear only in edge minors as well. * 3. If edgelist_label_offsets.has_value() is true, edge lists for different labels will be * renumbered separately. * From d1bdcbb927e06ad16e915c7b17e442efc4a9318c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 9 Apr 2024 10:13:29 -0700 Subject: [PATCH 07/80] clang-format --- .../sampling_post_processing_impl.cuh | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index d05a25bd8c7..4f857bc77fb 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -881,12 +881,7 @@ compute_renumber_map(raft::handle_t const& handle, unique_label_minor_pair_hops, unique_label_minor_pair_label_offsets] = compute_min_hop_for_unique_label_vertex_pairs( - handle, - edgelist_minors, - edgelist_hops, - std::nullopt, - std::nullopt, - edgelist_label_offsets); + handle, edgelist_minors, edgelist_hops, std::nullopt, std::nullopt, edgelist_label_offsets); if (edgelist_label_offsets) { auto num_labels = (*edgelist_label_offsets).size() - 1; @@ -1578,10 +1573,14 @@ renumber_and_compress_sampled_edgelist( return false; } }); - CUGRAPH_EXPECTS(num_invalids == 0, - "Invalid input arguments: if @p compress_per_hop is false and @p " - "edgelist_hops.has_value() is true, the minimum majors with hop N + 1 " - "should be larger than the maximum majors with hop N after renumbering."); // FIXME: re-phrase to input requirements? + CUGRAPH_EXPECTS( + num_invalids == 0, + "Invalid input arguments: if @p compress_per_hop is false and @p " + "edgelist_hops.has_value() is true, the minimum majors with hop N + 1 " + "should be larger than the maximum majors with hop N after renumbering."); // FIXME: + // re-phrase + // to input + // requirements? } } } From 586899d9ca8a81154d85a78be3f8381d1175c92a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 9 Apr 2024 19:04:35 -0700 Subject: [PATCH 08/80] improve documentation & input argument checking --- cpp/include/cugraph/sampling_functions.hpp | 18 +- .../sampling_post_processing_impl.cuh | 238 ++++++++++++++---- 2 files changed, 192 insertions(+), 64 deletions(-) diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp index 0c4b7c3b977..a4d7a162a90 100644 --- a/cpp/include/cugraph/sampling_functions.hpp +++ b/cpp/include/cugraph/sampling_functions.hpp @@ -39,7 +39,7 @@ namespace cugraph { * flag is the secondary key, flag=major is considered smaller than flag=minor if hop numbers are * same). Vertex IDs with smaller (hop, flag) pairs precede vertex IDs with larger (hop, flag) pairs * in renumbering. Ordering can be arbitrary among the vertices with the same (hop, flag) pairs. 
If - * @p seed_vertices.has-value() is true, we assume (hop=0, flag=major) for every vertex in @p + * @p seed_vertices.has_value() is true, we assume (hop=0, flag=major) for every vertex in @p * *seed_vertices in renumbering (this is relevant when there are seed vertices with no neighbors). * 2. If @p edgelist_hops is invalid, unique vertex IDs in edge majors precede vertex IDs that * appear only in edge minors. If @p seed_vertices.has_value() is true, vertices in @p @@ -57,9 +57,10 @@ namespace cugraph { * (if @p src_is_major is true) or DCSC (if @p src_is_major is false). If @p doubly_compress is * false, the CSR/CSC offset array size is the number of vertices (which is the maximum vertex ID + * 1) + 1. Here, the maximum vertex ID is the maximum major vertex ID in the edges to compress if @p - * compress_per_hop is false or for hop 0. If @p compress_per_hop is true and hop number is 1 or - * larger, the maximum vertex ID is the larger of the maximum major vertex ID for this hop and the - * maximum vertex ID for the edges in the previous hops. + * compress_per_hop is false or for hop 0 (@p seed_vertices should be included if valid). If @p + * compress_per_hop is true and hop number is 1 or larger, the maximum vertex ID is the larger of + * the maximum major vertex ID for this hop and the maximum vertex ID for the edges in the previous + * hops. * * If both @p compress_per_hop is false and @p edgelist_hops.has_value() is true, majors should be * non-decreasing within each label after renumbering and sorting by (hop, major, minor). Also, @@ -86,8 +87,7 @@ namespace cugraph { * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p * edgelist_srcs.size() if valid). * @param edgelist_hops An optional vector storing edge list hop numbers (size = @p - * edgelist_srcs.size() if valid). @p edgelist_hops should be valid if @p num_hops >= 2. The hop - * vector values should be non-decreasing within each label. + * edgelist_srcs.size() if valid). @p edgelist_hops should be valid if @p num_hops >= 2. * @param seed_vertices An optional pointer to the array storing seed vertices in hop 0. * @param seed_vertex_label_offsets An optional pointer to the array storing label offsets to the * seed vertices (size = @p num_labels + 1). @p seed_vertex_label_offsets should be valid if @p @@ -197,8 +197,7 @@ renumber_and_compress_sampled_edgelist( * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p * edgelist_srcs.size() if valid). * @param edgelist_hops An optional vector storing edge list hop numbers (size = @p - * edgelist_srcs.size() if valid). @p edgelist_hops should be valid if @p num_hops >= 2. The hop - * vector values should be non-decreasing within each label. + * edgelist_srcs.size() if valid). @p edgelist_hops should be valid if @p num_hops >= 2. * @param seed_vertices An optional pointer to the array storing seed vertices in hop 0. * @param seed_vertex_label_offsets An optional pointer to the array storing label offsets to the * seed vertices (size = @p num_labels + 1). @p seed_vertex_label_offsets should be valid if @p @@ -279,8 +278,7 @@ renumber_and_sort_sampled_edgelist( * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p * edgelist_srcs.size() if valid). * @param edgelist_hops An optional vector storing edge list hop numbers (size = @p - * edgelist_srcs.size() if valid). @p edgelist_hops must be valid if @p num_hops >= 2. The hop - * vector values should be non-decreasing within each label. 
+ * edgelist_srcs.size() if valid). @p edgelist_hops must be valid if @p num_hops >= 2. * @param edgelist_label_offsets An optional pointer to the array storing label offsets to the input * edges (size = @p num_labels + 1). @p edgelist_label_offsets must be valid if @p num_labels >= 2. * @param num_labels Number of labels. Labels are considered if @p num_labels >=2 and ignored if @p diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index 4f857bc77fb..27d393d9b9c 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -220,8 +220,6 @@ void check_input_edges(raft::handle_t const& handle, "Invalid input arguments: if edgelist_hops is valid, (*edgelist_hops).size() and " "edgelist_(srcs|dsts).size() should coincide."); - // FIXME: should check edgelist_hops elements are non-decreasing within each label? Or this - // requirement is not necessary? if (do_expensive_check) { if (edgelist_label_offsets) { CUGRAPH_EXPECTS(thrust::is_sorted(handle.get_thrust_policy(), @@ -288,7 +286,7 @@ void check_input_edges(raft::handle_t const& handle, &end_offset, (*edgelist_label_offsets).data() + (i + 1), 1, handle.get_stream()); handle.sync_stream(); } - + this_label_zero_hop_majors.resize(end_offset - start_offset, handle.get_stream()); if (edgelist_hops) { this_label_zero_hop_majors.resize( thrust::distance(this_label_zero_hop_majors.begin(), @@ -1093,15 +1091,16 @@ compute_renumber_map(raft::handle_t const& handle, // this function does not reorder edges (the i'th returned edge is the renumbered output of the // i'th input edge) template -std::tuple, - rmm::device_uvector, - rmm::device_uvector, - std::optional>> +std::tuple, // edgelist_majors + rmm::device_uvector, // edgelist minors + std::optional>, // seed_vertices, + rmm::device_uvector, // renumber_map + std::optional>> // renumber_map_label_offsets renumber_sampled_edgelist(raft::handle_t const& handle, rmm::device_uvector&& edgelist_majors, rmm::device_uvector&& edgelist_minors, - std::optional>&& edgelist_hops, - std::optional> seed_vertices, + std::optional> edgelist_hops, + std::optional>&& seed_vertices, std::optional> seed_vertex_label_offsets, std::optional> edgelist_label_offsets, size_t num_labels, @@ -1114,7 +1113,9 @@ renumber_sampled_edgelist(raft::handle_t const& handle, raft::device_span(edgelist_majors.data(), edgelist_majors.size()), raft::device_span(edgelist_minors.data(), edgelist_minors.size()), edgelist_hops, - seed_vertices, + seed_vertices ? 
std::make_optional>((*seed_vertices).data(), + (*seed_vertices).size()) + : std::nullopt, seed_vertex_label_offsets, edgelist_label_offsets); @@ -1294,6 +1295,40 @@ renumber_sampled_edgelist(raft::handle_t const& handle, assert(*it == old_vertex); return new_vertices[thrust::distance(old_vertices.begin(), it)]; }); + + if (seed_vertices) { + pair_first = thrust::make_zip_iterator((*seed_vertices).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + (*seed_vertices).size(), + (*seed_vertices).begin(), + [seed_vertex_label_offsets = *seed_vertex_label_offsets, + renumber_map_label_offsets = raft::device_span( + (*renumber_map_label_offsets).data(), (*renumber_map_label_offsets).size()), + old_vertices = raft::device_span(segment_sorted_renumber_map.data(), + segment_sorted_renumber_map.size()), + new_vertices = raft::device_span( + segment_sorted_new_vertices.data(), + segment_sorted_new_vertices.size())] __device__(auto pair) { + auto old_vertex = thrust::get<0>(pair); + auto label_idx = static_cast( + thrust::distance(seed_vertex_label_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, + seed_vertex_label_offsets.begin() + 1, + seed_vertex_label_offsets.end(), + thrust::get<1>(pair)))); + auto label_start_offset = renumber_map_label_offsets[label_idx]; + auto label_end_offset = renumber_map_label_offsets[label_idx + 1]; + auto it = thrust::lower_bound(thrust::seq, + old_vertices.begin() + label_start_offset, + old_vertices.begin() + label_end_offset, + old_vertex); + assert(*it == old_vertex); + return new_vertices[thrust::distance(old_vertices.begin(), it)]; + }); + } } else { kv_store_t kv_store(renumber_map.begin(), renumber_map.end(), @@ -1307,10 +1342,18 @@ renumber_sampled_edgelist(raft::handle_t const& handle, edgelist_majors.begin(), edgelist_majors.end(), edgelist_majors.begin(), handle.get_stream()); kv_store_view.find( edgelist_minors.begin(), edgelist_minors.end(), edgelist_minors.begin(), handle.get_stream()); + + if (seed_vertices) { + kv_store_view.find((*seed_vertices).begin(), + (*seed_vertices).end(), + (*seed_vertices).begin(), + handle.get_stream()); + } } return std::make_tuple(std::move(edgelist_majors), std::move(edgelist_minors), + std::move(seed_vertices), std::move(renumber_map), std::move(renumber_map_label_offsets)); } @@ -1485,9 +1528,22 @@ renumber_and_compress_sampled_edgelist( // 2. renumber + std::optional> renumbered_seed_vertices{std::nullopt}; + if (seed_vertices) { + renumbered_seed_vertices = + rmm::device_uvector((*seed_vertices).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*seed_vertices).begin(), + (*seed_vertices).end(), + (*renumbered_seed_vertices).begin()); + } rmm::device_uvector renumber_map(0, handle.get_stream()); std::optional> renumber_map_label_offsets{std::nullopt}; - std::tie(edgelist_majors, edgelist_minors, renumber_map, renumber_map_label_offsets) = + std::tie(edgelist_majors, + edgelist_minors, + renumbered_seed_vertices, + renumber_map, + renumber_map_label_offsets) = renumber_sampled_edgelist( handle, std::move(edgelist_majors), @@ -1495,7 +1551,7 @@ renumber_and_compress_sampled_edgelist( edgelist_hops ? 
std::make_optional(raft::device_span((*edgelist_hops).data(), (*edgelist_hops).size())) : std::nullopt, - seed_vertices, + std::move(renumbered_seed_vertices), seed_vertex_label_offsets, edgelist_label_offsets, num_labels, @@ -1517,6 +1573,20 @@ renumber_and_compress_sampled_edgelist( std::move(edgelist_hops), edgelist_label_offsets); + if (renumbered_seed_vertices) { + if (seed_vertex_label_offsets) { + auto label_indices = detail::expand_sparse_offsets( + *seed_vertex_label_offsets, label_index_t{0}, handle.get_stream()); + auto pair_first = + thrust::make_zip_iterator(label_indices.begin(), (*renumbered_seed_vertices).begin()); + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + label_indices.size()); + } else { + thrust::sort(handle.get_thrust_policy(), + (*renumbered_seed_vertices).begin(), + (*renumbered_seed_vertices).end()); + } + } + if (do_expensive_check) { if (!compress_per_hop && edgelist_hops) { rmm::device_uvector min_vertices(num_labels * num_hops, handle.get_stream()); @@ -1534,7 +1604,6 @@ renumber_and_compress_sampled_edgelist( auto output_key_first = thrust::make_zip_iterator(unique_key_label_indices.begin(), unique_key_hops.begin()); - // FIXME: should I consider seed_vertices in computing min,max? auto output_it = thrust::reduce_by_key(handle.get_thrust_policy(), input_key_first, @@ -1554,6 +1623,34 @@ renumber_and_compress_sampled_edgelist( max_vertices.begin(), thrust::equal_to>{}, thrust::maximum{}); + + if (renumbered_seed_vertices) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_labels), + [seed_vertices = raft::device_span((*renumbered_seed_vertices).data(), + (*renumbered_seed_vertices).size()), + seed_vertex_label_offsets = detail::to_thrust_optional(seed_vertex_label_offsets), + num_hops, + min_vertices = raft::device_span(min_vertices.data(), min_vertices.size()), + max_vertices = raft::device_span( + max_vertices.data(), max_vertices.size())] __device__(size_t l_idx) { + size_t label_start_offset{0}; + auto label_end_offset = seed_vertices.size(); + if (seed_vertex_label_offsets) { + label_start_offset = (*seed_vertex_label_offsets)[l_idx]; + label_end_offset = (*seed_vertex_label_offsets)[l_idx + 1]; + } + if (label_start_offset < label_end_offset) { + min_vertices[l_idx * num_hops] = cuda::std::min(min_vertices[l_idx * num_hops], + seed_vertices[label_start_offset]); + max_vertices[l_idx * num_hops] = cuda::std::max( + max_vertices[l_idx * num_hops], seed_vertices[label_end_offset - 1]); + } + }); + } + if (num_unique_keys > 1) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), @@ -1575,12 +1672,11 @@ renumber_and_compress_sampled_edgelist( }); CUGRAPH_EXPECTS( num_invalids == 0, - "Invalid input arguments: if @p compress_per_hop is false and @p " - "edgelist_hops.has_value() is true, the minimum majors with hop N + 1 " - "should be larger than the maximum majors with hop N after renumbering."); // FIXME: - // re-phrase - // to input - // requirements? + "Invalid input arguments: if both compress_per_hop is false and " + "edgelist_hops.has_value() is true, majors should be non-decreasing within each label " + "after renumbering and sorting by (hop, major, minor). Also, majors in hop N should not " + "appear in any of the previous hops. 
This condition is satisfied if majors in hop N + 1 " + "does not have any vertices from the previous hops excluding the minors from hop N."); } } } @@ -1728,45 +1824,69 @@ renumber_and_compress_sampled_edgelist( : thrust::nullopt, edgelist_majors = raft::device_span(edgelist_majors.data(), edgelist_majors.size()), + seed_vertices = renumbered_seed_vertices + ? thrust::make_optional>( + (*renumbered_seed_vertices).data(), (*renumbered_seed_vertices).size()) + : thrust::nullopt, + seed_vertex_label_offsets = detail::to_thrust_optional(seed_vertex_label_offsets), num_hops, compress_per_hop] __device__(size_t i) { - size_t start_offset{0}; - auto end_offset = edgelist_majors.size(); - auto label_start_offset = start_offset; - auto label_end_offset = end_offset; - - if (edgelist_label_offsets) { - auto l_idx = static_cast(i / num_hops); - start_offset = (*edgelist_label_offsets)[l_idx]; - end_offset = (*edgelist_label_offsets)[l_idx + 1]; - label_start_offset = start_offset; - label_end_offset = end_offset; - } + vertex_t num_vertices_from_edgelist{0}; + { + size_t start_offset{0}; + auto end_offset = edgelist_majors.size(); + auto label_start_offset = start_offset; + auto label_end_offset = end_offset; - if (num_hops > 1) { - auto h = static_cast(i % num_hops); - auto lower_it = thrust::lower_bound(thrust::seq, - (*edgelist_hops).begin() + start_offset, - (*edgelist_hops).begin() + end_offset, - h); - auto upper_it = thrust::upper_bound(thrust::seq, - (*edgelist_hops).begin() + start_offset, - (*edgelist_hops).begin() + end_offset, - h); - start_offset = static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); - end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); - } - if (compress_per_hop) { - return (start_offset < end_offset) ? 
(edgelist_majors[end_offset - 1] + 1) : vertex_t{0}; - } else { - if (end_offset != label_end_offset) { - return edgelist_majors[end_offset]; - } else if (label_start_offset < label_end_offset) { - return edgelist_majors[end_offset - 1] + 1; + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / num_hops); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + label_start_offset = start_offset; + label_end_offset = end_offset; + } + + if (num_hops > 1) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = + static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } + if (compress_per_hop) { + if (start_offset < end_offset) + num_vertices_from_edgelist = edgelist_majors[end_offset - 1] + 1; } else { - return vertex_t{0}; + if (end_offset != label_end_offset) { + num_vertices_from_edgelist = edgelist_majors[end_offset]; + } else if (label_start_offset < label_end_offset) { + num_vertices_from_edgelist = edgelist_majors[end_offset - 1] + 1; + } } } + + vertex_t num_vertices_from_seed_vertices{0}; + if (seed_vertices && (!compress_per_hop || (i % num_hops == 0))) { + size_t label_start_offset{0}; + auto label_end_offset = (*seed_vertices).size(); + if (seed_vertex_label_offsets) { + auto l_idx = static_cast(i / num_hops); + label_start_offset = (*seed_vertex_label_offsets)[l_idx]; + label_end_offset = (*seed_vertex_label_offsets)[l_idx + 1]; + } + if (label_start_offset < label_end_offset) { + num_vertices_from_seed_vertices = (*seed_vertices)[label_end_offset - 1] + 1; + } + } + + return cuda::std::max(num_vertices_from_edgelist, num_vertices_from_seed_vertices); }); std::optional> minor_vertex_counts{std::nullopt}; @@ -2035,9 +2155,19 @@ renumber_and_sort_sampled_edgelist( // 2. renumber + std::optional> renumbered_seed_vertices{std::nullopt}; + if (seed_vertices) { + renumbered_seed_vertices = + rmm::device_uvector((*seed_vertices).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*seed_vertices).begin(), + (*seed_vertices).end(), + (*renumbered_seed_vertices).begin()); + } rmm::device_uvector renumber_map(0, handle.get_stream()); std::optional> renumber_map_label_offsets{std::nullopt}; - std::tie(edgelist_majors, edgelist_minors, renumber_map, renumber_map_label_offsets) = + std::tie( + edgelist_majors, edgelist_minors, std::ignore, renumber_map, renumber_map_label_offsets) = renumber_sampled_edgelist( handle, std::move(edgelist_majors), @@ -2045,7 +2175,7 @@ renumber_and_sort_sampled_edgelist( edgelist_hops ? 
std::make_optional(raft::device_span((*edgelist_hops).data(), (*edgelist_hops).size())) : std::nullopt, - seed_vertices, + std::move(renumbered_seed_vertices), seed_vertex_label_offsets, edgelist_label_offsets, num_labels, From 633346b7a3ae421efcb85f8fdd49f166d8719639 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 9 Apr 2024 19:18:18 -0700 Subject: [PATCH 09/80] clang-format --- cpp/src/sampling/sampling_post_processing_impl.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index 27d393d9b9c..c7af16aaf40 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -1643,10 +1643,10 @@ renumber_and_compress_sampled_edgelist( label_end_offset = (*seed_vertex_label_offsets)[l_idx + 1]; } if (label_start_offset < label_end_offset) { - min_vertices[l_idx * num_hops] = cuda::std::min(min_vertices[l_idx * num_hops], - seed_vertices[label_start_offset]); - max_vertices[l_idx * num_hops] = cuda::std::max( - max_vertices[l_idx * num_hops], seed_vertices[label_end_offset - 1]); + min_vertices[l_idx * num_hops] = + cuda::std::min(min_vertices[l_idx * num_hops], seed_vertices[label_start_offset]); + max_vertices[l_idx * num_hops] = + cuda::std::max(max_vertices[l_idx * num_hops], seed_vertices[label_end_offset - 1]); } }); } From 5fb39a3fcc5ab0165fddb2248fb0080cbf5c4845 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 10 Apr 2024 09:14:14 -0700 Subject: [PATCH 10/80] bug fix in the C-API --- cpp/src/c_api/uniform_neighbor_sampling.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index 162d2c6c675..14e20628c16 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -272,7 +272,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct offsets ? std::make_optional( raft::device_span{offsets->data(), offsets->size()}) : std::nullopt, - edge_label->size(), + edge_label ? edge_label->size() : size_t{1}, fan_out_->size_, src_is_major, do_expensive_check_); @@ -309,7 +309,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct offsets ? std::make_optional( raft::device_span{offsets->data(), offsets->size()}) : std::nullopt, - edge_label->size(), + edge_label ? edge_label->size() : size_t{1}, fan_out_->size_, src_is_major, options_.compress_per_hop_, @@ -340,7 +340,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct ? std::make_optional(raft::device_span{ offsets->data(), offsets->size()}) : std::nullopt, - edge_label->size(), + edge_label ? 
edge_label->size() : size_t{1}, fan_out_->size_, src_is_major, do_expensive_check_); From 2de1bad6c683ad5ff0054d40e0d0b382ffe82cee Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Wed, 10 Apr 2024 20:31:52 -0400 Subject: [PATCH 11/80] change interface to allow EdgeIndex --- .../cugraph-pyg/cugraph_pyg/nn/conv/base.py | 57 +++++++++++-------- .../cugraph_pyg/nn/conv/gat_conv.py | 4 +- .../cugraph_pyg/nn/conv/gatv2_conv.py | 4 +- .../cugraph_pyg/nn/conv/rgcn_conv.py | 8 +-- .../cugraph_pyg/nn/conv/sage_conv.py | 6 +- .../cugraph_pyg/nn/conv/transformer_conv.py | 6 +- 6 files changed, 47 insertions(+), 38 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py index 10431a0398d..15f4a51a6d8 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,11 +15,16 @@ from typing import Optional, Tuple, Union from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch import CSC, HeteroCSC +import pylibcugraphops.pytorch + +# from pylibcugraphops.pytorch import CSC, HeteroCSC torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") +# A tuple of (row, colptr, num_src_nodes) +CSC = Tuple[torch.Tensor, torch.Tensor, int] + class BaseConv(torch.nn.Module): # pragma: no cover r"""An abstract base class for implementing cugraph-ops message passing layers.""" @@ -33,10 +38,7 @@ def to_csc( edge_index: torch.Tensor, size: Optional[Tuple[int, int]] = None, edge_attr: Optional[torch.Tensor] = None, - ) -> Union[ - Tuple[torch.Tensor, torch.Tensor, int], - Tuple[Tuple[torch.Tensor, torch.Tensor, int], torch.Tensor], - ]: + ) -> Union[CSC, Tuple[CSC, torch.Tensor],]: r"""Returns a CSC representation of an :obj:`edge_index` tensor to be used as input to cugraph-ops conv layers. @@ -71,19 +73,17 @@ def to_csc( def get_cugraph( self, - csc: Tuple[torch.Tensor, torch.Tensor, int], + edge_index: Union[torch_geometric.EdgeIndex, CSC], bipartite: bool = False, max_num_neighbors: Optional[int] = None, - ) -> CSC: + ) -> pylibcugraphops.pytorch.CSC: r"""Constructs a :obj:`cugraph-ops` graph object from CSC representation. Supports both bipartite and non-bipartite graphs. Args: - csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC - representation of a graph, given as a tuple of - :obj:`(row, colptr, num_src_nodes)`. Use the - :meth:`to_csc` method to convert an :obj:`edge_index` - representation to the desired format. + edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge + indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for + CSC representation. bipartite (bool): If set to :obj:`True`, will create the bipartite structure in cugraph-ops. (default: :obj:`False`) max_num_neighbors (int, optional): The maximum number of neighbors @@ -91,7 +91,12 @@ def get_cugraph( the message-flow-graph primitives in cugraph-ops. 
(default: :obj:`None`) """ - row, colptr, num_src_nodes = csc + if isinstance(edge_index, torch_geometric.EdgeIndex): + edge_index = edge_index.sort_by("col")[0] + num_src_nodes = edge_index.get_sparse_size(0) + (colptr, row), _ = edge_index.get_csc() + else: + row, colptr, num_src_nodes = edge_index if not row.is_cuda: raise RuntimeError( @@ -102,7 +107,7 @@ def get_cugraph( if max_num_neighbors is None: max_num_neighbors = -1 - return CSC( + return pylibcugraphops.pytorch.CSC( offsets=colptr, indices=row, num_src_nodes=num_src_nodes, @@ -112,22 +117,20 @@ def get_cugraph( def get_typed_cugraph( self, - csc: Tuple[torch.Tensor, torch.Tensor, int], + edge_index: Union[torch_geometric.EdgeIndex, CSC], edge_type: torch.Tensor, num_edge_types: Optional[int] = None, bipartite: bool = False, max_num_neighbors: Optional[int] = None, - ) -> HeteroCSC: + ) -> pylibcugraphops.pytorch.HeteroCSC: r"""Constructs a typed :obj:`cugraph` graph object from a CSC representation where each edge corresponds to a given edge type. Supports both bipartite and non-bipartite graphs. Args: - csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC - representation of a graph, given as a tuple of - :obj:`(row, colptr, num_src_nodes)`. Use the - :meth:`to_csc` method to convert an :obj:`edge_index` - representation to the desired format. + edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge + indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for + CSC representation. edge_type (torch.Tensor): The edge type. num_edge_types (int, optional): The maximum number of edge types. When not given, will be computed on-the-fly, leading to @@ -145,10 +148,16 @@ def get_typed_cugraph( if max_num_neighbors is None: max_num_neighbors = -1 - row, colptr, num_src_nodes = csc + if isinstance(edge_index, torch_geometric.EdgeIndex): + edge_index, perm = edge_index.sort_by("col") + edge_type = edge_type[perm] + num_src_nodes = edge_index.get_sparse_size(0) + (colptr, row), _ = edge_index.get_csc() + else: + row, colptr, num_src_nodes = edge_index edge_type = edge_type.int() - return HeteroCSC( + return pylibcugraphops.pytorch.HeteroCSC( offsets=colptr, indices=row, edge_types=edge_type, diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py index d1785f2bef8..27af5e7257d 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py @@ -16,7 +16,7 @@ from cugraph.utilities.utils import import_optional from pylibcugraphops.pytorch.operators import mha_gat_n2n -from .base import BaseConv +from .base import BaseConv, CSC torch = import_optional("torch") nn = import_optional("torch.nn") @@ -159,7 +159,7 @@ def reset_parameters(self): def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Tuple[torch.Tensor, torch.Tensor, int], + csc: Union[torch_geometric.EdgeIndex, CSC], edge_attr: Optional[torch.Tensor] = None, max_num_neighbors: Optional[int] = None, deterministic_dgrad: bool = False, diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py index 33865898816..e091f91cd2f 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py @@ -16,7 +16,7 @@ from cugraph.utilities.utils import import_optional from pylibcugraphops.pytorch.operators import mha_gat_v2_n2n -from .base import BaseConv +from .base import BaseConv, CSC torch = 
import_optional("torch") nn = import_optional("torch.nn") @@ -172,7 +172,7 @@ def reset_parameters(self): def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Tuple[torch.Tensor, torch.Tensor, int], + csc: Union[torch_geometric.EdgeIndex, CSC], edge_attr: Optional[torch.Tensor] = None, deterministic_dgrad: bool = False, deterministic_wgrad: bool = False, diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py index 683780b66eb..e4e89d9bc1d 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,12 +11,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple +from typing import Optional, Union from cugraph.utilities.utils import import_optional from pylibcugraphops.pytorch.operators import agg_hg_basis_n2n_post -from .base import BaseConv +from .base import BaseConv, CSC torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") @@ -110,7 +110,7 @@ def reset_parameters(self): def forward( self, x: torch.Tensor, - csc: Tuple[torch.Tensor, torch.Tensor, int], + csc: Union[torch_geometric.EdgeIndex, CSC], edge_type: torch.Tensor, max_num_neighbors: Optional[int] = None, ) -> torch.Tensor: diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py index 8e0c1027416..24d3c935db6 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -16,7 +16,7 @@ from cugraph.utilities.utils import import_optional from pylibcugraphops.pytorch.operators import agg_concat_n2n -from .base import BaseConv +from .base import BaseConv, CSC torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") @@ -116,7 +116,7 @@ def reset_parameters(self): def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Tuple[torch.Tensor, torch.Tensor, int], + csc: Union[torch_geometric.EdgeIndex, CSC], max_num_neighbors: Optional[int] = None, ) -> torch.Tensor: bipartite = isinstance(x, Tuple) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py index 41c0b4b4090..b4fcb66cfef 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -16,7 +16,7 @@ from cugraph.utilities.utils import import_optional from pylibcugraphops.pytorch.operators import mha_simple_n2n -from .base import BaseConv +from .base import BaseConv, CSC torch = import_optional("torch") nn = import_optional("torch.nn") @@ -153,7 +153,7 @@ def reset_parameters(self): def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Tuple[torch.Tensor, torch.Tensor, int], + csc: Union[torch_geometric.EdgeIndex, CSC], edge_attr: Optional[torch.Tensor] = None, ) -> torch.Tensor: r"""Runs the forward pass of the module. From 98faf07fbd9de2a19fdc49f4b23836b3c7f07d8c Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Wed, 10 Apr 2024 22:09:10 -0400 Subject: [PATCH 12/80] clean up --- python/cugraph-pyg/cugraph_pyg/nn/conv/base.py | 1 - .../cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py index 15f4a51a6d8..39e4b522253 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py @@ -17,7 +17,6 @@ from cugraph.utilities.utils import import_optional import pylibcugraphops.pytorch -# from pylibcugraphops.pytorch import CSC, HeteroCSC torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py index e1029849cb0..5b25e7dc334 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py @@ -71,9 +71,6 @@ def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads): w_src[edge_type][:, :] = conv1.convs[edge_type].lin_src.weight[:, :] if w_dst[edge_type] is not None: w_dst[edge_type][:, :] = conv1.convs[edge_type].lin_dst.weight[:, :] - # w_src[edge_type][:, :] = conv1.convs[edge_type].lin_src.weight[:, :] - # if w_dst[edge_type] is not None: - # w_dst[edge_type][:, :] = conv1.convs[edge_type].lin_dst.weight[:, :] conv2.attn_weights[edge_type][: heads * out_channels] = conv1.convs[ edge_type From a703b39a0744544b5a8fd4e2ba5fa286ba4cfc48 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 11 Apr 2024 11:51:07 -0700 Subject: [PATCH 13/80] bug fix --- cpp/src/c_api/uniform_neighbor_sampling.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index 14e20628c16..100e81a5bd2 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -273,7 +273,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct raft::device_span{offsets->data(), offsets->size()}) : std::nullopt, edge_label ? edge_label->size() : size_t{1}, - fan_out_->size_, + hop ? fan_out_->size_ : size_t{1}, src_is_major, do_expensive_check_); @@ -310,7 +310,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct raft::device_span{offsets->data(), offsets->size()}) : std::nullopt, edge_label ? edge_label->size() : size_t{1}, - fan_out_->size_, + hop ? 
fan_out_->size_ : size_t{1}, src_is_major, options_.compress_per_hop_, doubly_compress, @@ -341,7 +341,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct offsets->data(), offsets->size()}) : std::nullopt, edge_label ? edge_label->size() : size_t{1}, - fan_out_->size_, + hop ? fan_out_->size_ : size_t{1}, src_is_major, do_expensive_check_); From 7d270187725eead4f7439752a071c659b60e5bca Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 11 Apr 2024 16:13:19 -0700 Subject: [PATCH 14/80] CI experiment --- cpp/src/c_api/uniform_neighbor_sampling.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index 100e81a5bd2..d2ae231010f 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -215,6 +215,17 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct options_.prior_sources_behavior_, options_.dedupe_sources_, do_expensive_check_); +#if 1 // DEBUG + if (edge_label) { + std::cout << "options_.renumber_results_ = " << options_.renumber_results_ + << " edge_label.has_value()= true, edge_label->size()=" << edge_label->size() + << " label_list=" << label_list_ << " label_list->size_=" << label_list->size_ + << std::endl; + } else { + std::cout << "edge_label.has_value()= false" << options_.renumber_results_ = + " << options_.renumber_results_ << " std::endl; + } +#endif std::vector vertex_partition_lasts = graph_view.vertex_partition_range_lasts(); From 8b3a5bd5a87e487dc1019ca06dbab7695aa31dfb Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 12 Apr 2024 10:19:21 -0700 Subject: [PATCH 15/80] compile error fix --- cpp/src/c_api/uniform_neighbor_sampling.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index d2ae231010f..56d54939b5e 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -219,11 +219,11 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct if (edge_label) { std::cout << "options_.renumber_results_ = " << options_.renumber_results_ << " edge_label.has_value()= true, edge_label->size()=" << edge_label->size() - << " label_list=" << label_list_ << " label_list->size_=" << label_list->size_ + << " label_list=" << label_list_ << " label_list->size_=" << label_list_->size_ << std::endl; } else { - std::cout << "edge_label.has_value()= false" << options_.renumber_results_ = - " << options_.renumber_results_ << " std::endl; + std::cout << "edge_label.has_value()= false, options_.renumber_results_=" + << options_.renumber_results_ << std::endl; } #endif From a6896c10c890ddad324fc9d97c9cc1b6cecccb22 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 12 Apr 2024 14:55:36 -0700 Subject: [PATCH 16/80] fix --- cpp/src/c_api/uniform_neighbor_sampling.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index 56d54939b5e..8e82cb74835 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -219,7 +219,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct if (edge_label) { std::cout << "options_.renumber_results_ = " << options_.renumber_results_ << " edge_label.has_value()= true, 
edge_label->size()=" << edge_label->size() - << " label_list=" << label_list_ << " label_list->size_=" << label_list_->size_ << std::endl; } else { std::cout << "edge_label.has_value()= false, options_.renumber_results_=" From 51be4c68d6d125cf1b350e462ca47d8a27a9f91b Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Fri, 12 Apr 2024 23:57:03 -0400 Subject: [PATCH 17/80] return perm in helpers --- .../cugraph-pyg/cugraph_pyg/nn/conv/base.py | 52 +++++++++++-------- .../cugraph_pyg/nn/conv/gat_conv.py | 17 +++--- .../cugraph_pyg/nn/conv/gatv2_conv.py | 12 ++--- .../cugraph_pyg/nn/conv/rgcn_conv.py | 9 ++-- .../cugraph_pyg/nn/conv/sage_conv.py | 8 +-- .../cugraph_pyg/nn/conv/transformer_conv.py | 12 ++--- .../cugraph_pyg/tests/nn/test_gat_conv.py | 37 ++++++++++--- .../cugraph_pyg/tests/nn/test_gatv2_conv.py | 31 +++++++---- .../cugraph_pyg/tests/nn/test_rgcn_conv.py | 24 +++++++-- .../cugraph_pyg/tests/nn/test_sage_conv.py | 20 +++++-- .../tests/nn/test_transformer_conv.py | 14 +++-- 11 files changed, 158 insertions(+), 78 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py index 39e4b522253..713448a8203 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py @@ -75,7 +75,7 @@ def get_cugraph( edge_index: Union[torch_geometric.EdgeIndex, CSC], bipartite: bool = False, max_num_neighbors: Optional[int] = None, - ) -> pylibcugraphops.pytorch.CSC: + ) -> Tuple[pylibcugraphops.pytorch.CSC, Optional[torch.Tensor]]: r"""Constructs a :obj:`cugraph-ops` graph object from CSC representation. Supports both bipartite and non-bipartite graphs. @@ -90,8 +90,9 @@ def get_cugraph( the message-flow-graph primitives in cugraph-ops. (default: :obj:`None`) """ + perm = None if isinstance(edge_index, torch_geometric.EdgeIndex): - edge_index = edge_index.sort_by("col")[0] + edge_index, perm = edge_index.sort_by("col") num_src_nodes = edge_index.get_sparse_size(0) (colptr, row), _ = edge_index.get_csc() else: @@ -106,12 +107,15 @@ def get_cugraph( if max_num_neighbors is None: max_num_neighbors = -1 - return pylibcugraphops.pytorch.CSC( - offsets=colptr, - indices=row, - num_src_nodes=num_src_nodes, - dst_max_in_degree=max_num_neighbors, - is_bipartite=bipartite, + return ( + pylibcugraphops.pytorch.CSC( + offsets=colptr, + indices=row, + num_src_nodes=num_src_nodes, + dst_max_in_degree=max_num_neighbors, + is_bipartite=bipartite, + ), + perm, ) def get_typed_cugraph( @@ -121,7 +125,7 @@ def get_typed_cugraph( num_edge_types: Optional[int] = None, bipartite: bool = False, max_num_neighbors: Optional[int] = None, - ) -> pylibcugraphops.pytorch.HeteroCSC: + ) -> Tuple[pylibcugraphops.pytorch.HeteroCSC, Optional[torch.Tensor]]: r"""Constructs a typed :obj:`cugraph` graph object from a CSC representation where each edge corresponds to a given edge type. Supports both bipartite and non-bipartite graphs. 
@@ -147,6 +151,7 @@ def get_typed_cugraph( if max_num_neighbors is None: max_num_neighbors = -1 + perm = None if isinstance(edge_index, torch_geometric.EdgeIndex): edge_index, perm = edge_index.sort_by("col") edge_type = edge_type[perm] @@ -156,29 +161,30 @@ def get_typed_cugraph( row, colptr, num_src_nodes = edge_index edge_type = edge_type.int() - return pylibcugraphops.pytorch.HeteroCSC( - offsets=colptr, - indices=row, - edge_types=edge_type, - num_src_nodes=num_src_nodes, - num_edge_types=num_edge_types, - dst_max_in_degree=max_num_neighbors, - is_bipartite=bipartite, + return ( + pylibcugraphops.pytorch.HeteroCSC( + offsets=colptr, + indices=row, + edge_types=edge_type, + num_src_nodes=num_src_nodes, + num_edge_types=num_edge_types, + dst_max_in_degree=max_num_neighbors, + is_bipartite=bipartite, + ), + perm, ) def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Tuple[torch.Tensor, torch.Tensor, int], + edge_index: Union[torch_geometric.EdgeIndex, CSC], ) -> torch.Tensor: r"""Runs the forward pass of the module. Args: x (torch.Tensor): The node features. - csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC - representation of a graph, given as a tuple of - :obj:`(row, colptr, num_src_nodes)`. Use the - :meth:`to_csc` method to convert an :obj:`edge_index` - representation to the desired format. + edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge + indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for + CSC representation. """ raise NotImplementedError diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py index 27af5e7257d..981b1c5b50d 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py @@ -159,7 +159,7 @@ def reset_parameters(self): def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Union[torch_geometric.EdgeIndex, CSC], + edge_index: Union[torch_geometric.EdgeIndex, CSC], edge_attr: Optional[torch.Tensor] = None, max_num_neighbors: Optional[int] = None, deterministic_dgrad: bool = False, @@ -172,11 +172,7 @@ def forward( Args: x (torch.Tensor or tuple): The node features. Can be a tuple of tensors denoting source and destination node features. - csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC - representation of a graph, given as a tuple of - :obj:`(row, colptr, num_src_nodes)`. Use the - :meth:`to_csc` method to convert an :obj:`edge_index` - representation to the desired format. + edge_index (EdgeIndex or CSC): The edge indices. edge_attr: (torch.Tensor, optional) The edge features. max_num_neighbors (int, optional): The maximum number of neighbors of a destination node. When enabled, it allows models to use @@ -198,9 +194,12 @@ def forward( the corresponding input type at the very end. 
""" bipartite = not isinstance(x, torch.Tensor) - graph = self.get_cugraph( - csc, bipartite=bipartite, max_num_neighbors=max_num_neighbors + graph, perm = self.get_cugraph( + edge_index=edge_index, + bipartite=bipartite, + max_num_neighbors=max_num_neighbors, ) + if deterministic_dgrad: graph.add_reverse_graph() @@ -212,6 +211,8 @@ def forward( ) if edge_attr.dim() == 1: edge_attr = edge_attr.view(-1, 1) + if perm is not None: + edge_attr = edge_attr[perm] edge_attr = self.lin_edge(edge_attr) if bipartite: diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py index e091f91cd2f..ebb30de9754 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py @@ -172,7 +172,7 @@ def reset_parameters(self): def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Union[torch_geometric.EdgeIndex, CSC], + edge_index: Union[torch_geometric.EdgeIndex, CSC], edge_attr: Optional[torch.Tensor] = None, deterministic_dgrad: bool = False, deterministic_wgrad: bool = False, @@ -182,11 +182,7 @@ def forward( Args: x (torch.Tensor or tuple): The node features. Can be a tuple of tensors denoting source and destination node features. - csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC - representation of a graph, given as a tuple of - :obj:`(row, colptr, num_src_nodes)`. Use the - :meth:`to_csc` method to convert an :obj:`edge_index` - representation to the desired format. + edge_index (EdgeIndex or CSC): The edge indices. edge_attr: (torch.Tensor, optional) The edge features. deterministic_dgrad : bool, default=False Optional flag indicating whether the feature gradients @@ -196,7 +192,7 @@ def forward( are computed deterministically using a dedicated workspace buffer. 
""" bipartite = not isinstance(x, torch.Tensor) or not self.share_weights - graph = self.get_cugraph(csc, bipartite=bipartite) + graph, perm = self.get_cugraph(edge_index, bipartite=bipartite) if deterministic_dgrad: graph.add_reverse_graph() @@ -208,6 +204,8 @@ def forward( ) if edge_attr.dim() == 1: edge_attr = edge_attr.view(-1, 1) + if perm is not None: + edge_attr = edge_attr[perm] edge_attr = self.lin_edge(edge_attr) if bipartite: diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py index e4e89d9bc1d..13fa08db5c5 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py @@ -110,13 +110,16 @@ def reset_parameters(self): def forward( self, x: torch.Tensor, - csc: Union[torch_geometric.EdgeIndex, CSC], + edge_index: Union[torch_geometric.EdgeIndex, CSC], edge_type: torch.Tensor, max_num_neighbors: Optional[int] = None, ) -> torch.Tensor: - graph = self.get_typed_cugraph( - csc, edge_type, self.num_relations, max_num_neighbors=max_num_neighbors + graph, _ = self.get_typed_cugraph( + edge_index, + edge_type, + self.num_relations, + max_num_neighbors=max_num_neighbors, ) out = agg_hg_basis_n2n_post( diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py index 24d3c935db6..65dc99d8988 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py @@ -116,12 +116,14 @@ def reset_parameters(self): def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Union[torch_geometric.EdgeIndex, CSC], + edge_index: Union[torch_geometric.EdgeIndex, CSC], max_num_neighbors: Optional[int] = None, ) -> torch.Tensor: bipartite = isinstance(x, Tuple) - graph = self.get_cugraph( - csc, bipartite=bipartite, max_num_neighbors=max_num_neighbors + graph, _ = self.get_cugraph( + edge_index=edge_index, + bipartite=bipartite, + max_num_neighbors=max_num_neighbors, ) if self.project: diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py index b4fcb66cfef..e184ee0e893 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py @@ -153,7 +153,7 @@ def reset_parameters(self): def forward( self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - csc: Union[torch_geometric.EdgeIndex, CSC], + edge_index: Union[torch_geometric.EdgeIndex, CSC], edge_attr: Optional[torch.Tensor] = None, ) -> torch.Tensor: r"""Runs the forward pass of the module. @@ -161,15 +161,11 @@ def forward( Args: x (torch.Tensor or tuple): The node features. Can be a tuple of tensors denoting source and destination node features. - csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC - representation of a graph, given as a tuple of - :obj:`(row, colptr, num_src_nodes)`. Use the - :meth:`to_csc` method to convert an :obj:`edge_index` - representation to the desired format. + edge_index (EdgeIndex or CSC): The edge indices. edge_attr: (torch.Tensor, optional) The edge features. """ bipartite = True - graph = self.get_cugraph(csc, bipartite=bipartite) + graph, perm = self.get_cugraph(edge_index=edge_index, bipartite=bipartite) if isinstance(x, torch.Tensor): x = (x, x) @@ -184,6 +180,8 @@ def forward( f"{self.__class__.__name__}.edge_dim must be set to accept " f"edge features." 
) + if perm is not None: + edge_attr = edge_attr[perm] edge_attr = self.lin_edge(edge_attr) out = mha_simple_n2n( diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py index 3a34372cdce..5e0cac1cb3f 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py @@ -22,6 +22,7 @@ @pytest.mark.skipif( package_available("torch_geometric<2.5"), reason="Test requires pyg>=2.5" ) +@pytest.mark.parametrize("use_edge_index", [True, False]) @pytest.mark.parametrize("bias", [True, False]) @pytest.mark.parametrize("bipartite", [True, False]) @pytest.mark.parametrize("concat", [True, False]) @@ -30,9 +31,18 @@ @pytest.mark.parametrize("use_edge_attr", [True, False]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) def test_gat_conv_equality( - bias, bipartite, concat, heads, max_num_neighbors, use_edge_attr, graph, request + use_edge_index, + bias, + bipartite, + concat, + heads, + max_num_neighbors, + use_edge_attr, + graph, + request, ): import torch + from torch_geometric import EdgeIndex from torch_geometric.nn import GATConv torch.manual_seed(12345) @@ -53,13 +63,19 @@ def test_gat_conv_equality( if use_edge_attr: edge_dim = 3 edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda() - csc, edge_attr_perm = CuGraphGATConv.to_csc( - edge_index, size, edge_attr=edge_attr - ) else: - edge_dim = None - edge_attr = edge_attr_perm = None - csc = CuGraphGATConv.to_csc(edge_index, size) + edge_dim = edge_attr = None + + if use_edge_index: + csc = EdgeIndex(edge_index, sparse_size=size) + else: + if use_edge_attr: + csc, edge_attr_perm = CuGraphGATConv.to_csc( + edge_index, size, edge_attr=edge_attr + ) + else: + csc = CuGraphGATConv.to_csc(edge_index, size) + edge_attr_perm = None kwargs = dict(bias=bias, concat=concat, edge_dim=edge_dim) @@ -83,7 +99,12 @@ def test_gat_conv_equality( conv2.lin_edge.weight.data = conv1.lin_edge.weight.data.detach().clone() out1 = conv1(x, edge_index, edge_attr=edge_attr) - out2 = conv2(x, csc, edge_attr=edge_attr_perm, max_num_neighbors=max_num_neighbors) + if use_edge_index: + out2 = conv2(x, csc, edge_attr=edge_attr, max_num_neighbors=max_num_neighbors) + else: + out2 = conv2( + x, csc, edge_attr=edge_attr_perm, max_num_neighbors=max_num_neighbors + ) assert torch.allclose(out1, out2, atol=ATOL) grad_output = torch.rand_like(out1) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py index a4794628410..120a0348317 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
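When an EdgeIndex is passed, get_cugraph()/get_typed_cugraph() sort it by destination column and hand the resulting permutation back to forward(), which uses it to reorder per-edge inputs such as edge_attr; the CSC-tuple path returns no permutation because to_csc() already reorders the attributes. A small sketch of that mechanism, using toy tensors that are not taken from the test suite and assuming torch_geometric>=2.5:

    import torch
    from torch_geometric import EdgeIndex

    edge_index = EdgeIndex(torch.tensor([[0, 2, 1, 2], [2, 0, 1, 1]]), sparse_size=(3, 3))
    edge_attr = torch.rand(4, 3)

    # sort_by("col") groups edges by destination (CSC order) and returns the permutation
    # it applied, so per-edge data can be reordered to stay aligned with the edges.
    sorted_index, perm = edge_index.sort_by("col")
    edge_attr = edge_attr[perm]

    (colptr, row), _ = sorted_index.get_csc()   # offsets/indices consumed by cugraph-ops
    num_src_nodes = sorted_index.get_sparse_size(0)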
# You may obtain a copy of the License at @@ -18,14 +18,18 @@ ATOL = 1e-6 +@pytest.mark.parametrize("use_edge_index", [True, False]) @pytest.mark.parametrize("bipartite", [True, False]) @pytest.mark.parametrize("concat", [True, False]) @pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) @pytest.mark.parametrize("use_edge_attr", [True, False]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) -def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr, graph, request): +def test_gatv2_conv_equality( + use_edge_index, bipartite, concat, heads, use_edge_attr, graph, request +): pytest.importorskip("torch_geometric", reason="PyG not available") import torch + from torch_geometric import EdgeIndex from torch_geometric.nn import GATv2Conv torch.manual_seed(12345) @@ -46,13 +50,19 @@ def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr, graph, req if use_edge_attr: edge_dim = 3 edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda() - csc, edge_attr_perm = CuGraphGATv2Conv.to_csc( - edge_index, size, edge_attr=edge_attr - ) else: - edge_dim = None - edge_attr = edge_attr_perm = None - csc = CuGraphGATv2Conv.to_csc(edge_index, size) + edge_dim = edge_attr = None + + if use_edge_index: + csc = EdgeIndex(edge_index, sparse_size=size) + else: + if use_edge_attr: + csc, edge_attr_perm = CuGraphGATv2Conv.to_csc( + edge_index, size, edge_attr=edge_attr + ) + else: + csc = CuGraphGATv2Conv.to_csc(edge_index, size) + edge_attr_perm = None kwargs = dict(bias=False, concat=concat, edge_dim=edge_dim) @@ -69,7 +79,10 @@ def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr, graph, req conv2.lin_edge.weight.data = conv1.lin_edge.weight.data.detach().clone() out1 = conv1(x, edge_index, edge_attr=edge_attr) - out2 = conv2(x, csc, edge_attr=edge_attr_perm) + if use_edge_index: + out2 = conv2(x, csc, edge_attr=edge_attr) + else: + out2 = conv2(x, csc, edge_attr=edge_attr_perm) assert torch.allclose(out1, out2, atol=ATOL) grad_output = torch.rand_like(out1) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py index ded4f300c0c..fedc7edabe9 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -18,6 +18,7 @@ ATOL = 1e-6 +@pytest.mark.parametrize("use_edge_index", [True, False]) @pytest.mark.parametrize("aggr", ["add", "sum", "mean"]) @pytest.mark.parametrize("bias", [True, False]) @pytest.mark.parametrize("max_num_neighbors", [8, None]) @@ -25,10 +26,18 @@ @pytest.mark.parametrize("root_weight", [True, False]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) def test_rgcn_conv_equality( - aggr, bias, max_num_neighbors, num_bases, root_weight, graph, request + use_edge_index, + aggr, + bias, + max_num_neighbors, + num_bases, + root_weight, + graph, + request, ): pytest.importorskip("torch_geometric", reason="PyG not available") import torch + from torch_geometric import EdgeIndex from torch_geometric.nn import FastRGCNConv as RGCNConv torch.manual_seed(12345) @@ -39,8 +48,12 @@ def test_rgcn_conv_equality( edge_index = edge_index.cuda() edge_type = torch.randint(num_relations, (edge_index.size(1),)).cuda() + if use_edge_index: + csc = EdgeIndex(edge_index, sparse_size=size) + else: + csc, edge_type_perm = CuGraphRGCNConv.to_csc(edge_index, size, edge_type) + x = torch.rand(size[0], in_channels, device="cuda") - csc, edge_type_perm = CuGraphRGCNConv.to_csc(edge_index, size, edge_type) conv1 = RGCNConv(in_channels, out_channels, num_relations, **kwargs).cuda() conv2 = CuGraphRGCNConv(in_channels, out_channels, num_relations, **kwargs).cuda() @@ -55,7 +68,10 @@ def test_rgcn_conv_equality( conv2.comp.data = conv1.comp.data.detach().clone() out1 = conv1(x, edge_index, edge_type) - out2 = conv2(x, csc, edge_type_perm, max_num_neighbors=max_num_neighbors) + if use_edge_index: + out2 = conv2(x, csc, edge_type) + else: + out2 = conv2(x, csc, edge_type_perm, max_num_neighbors=max_num_neighbors) assert torch.allclose(out1, out2, atol=ATOL) grad_out = torch.rand_like(out1) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py index b2977d1d175..62a32a603b1 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -18,6 +18,7 @@ ATOL = 1e-6 +@pytest.mark.parametrize("use_edge_index", [True, False]) @pytest.mark.parametrize("aggr", ["sum", "mean", "min", "max"]) @pytest.mark.parametrize("bias", [True, False]) @pytest.mark.parametrize("bipartite", [True, False]) @@ -26,16 +27,29 @@ @pytest.mark.parametrize("root_weight", [True, False]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) def test_sage_conv_equality( - aggr, bias, bipartite, max_num_neighbors, normalize, root_weight, graph, request + use_edge_index, + aggr, + bias, + bipartite, + max_num_neighbors, + normalize, + root_weight, + graph, + request, ): pytest.importorskip("torch_geometric", reason="PyG not available") import torch + from torch_geometric import EdgeIndex from torch_geometric.nn import SAGEConv torch.manual_seed(12345) edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() - csc = CuGraphSAGEConv.to_csc(edge_index, size) + + if use_edge_index: + csc = EdgeIndex(edge_index, sparse_size=size) + else: + csc = CuGraphSAGEConv.to_csc(edge_index, size) if bipartite: in_channels = (7, 3) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py index fbdb244898b..85461cdeb38 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,19 +18,27 @@ ATOL = 1e-6 +@pytest.mark.parametrize("use_edge_index", [True, False]) @pytest.mark.parametrize("bipartite", [True, False]) @pytest.mark.parametrize("concat", [True, False]) @pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) -def test_transformer_conv_equality(bipartite, concat, heads, graph, request): +def test_transformer_conv_equality( + use_edge_index, bipartite, concat, heads, graph, request +): pytest.importorskip("torch_geometric", reason="PyG not available") import torch + from torch_geometric import EdgeIndex from torch_geometric.nn import TransformerConv torch.manual_seed(12345) edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() - csc = CuGraphTransformerConv.to_csc(edge_index, size) + + if use_edge_index: + csc = EdgeIndex(edge_index, sparse_size=size) + else: + csc = CuGraphTransformerConv.to_csc(edge_index, size) out_channels = 2 kwargs = dict(concat=concat, bias=False, root_weight=False) From 8d209d584aab79f09b1418c003156e2c85714ff3 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 15 Apr 2024 11:50:00 -0700 Subject: [PATCH 18/80] CI failure fix --- cpp/src/c_api/uniform_neighbor_sampling.cpp | 10 --- .../sampling_post_processing_impl.cuh | 77 +++++++++++-------- 2 files changed, 44 insertions(+), 43 deletions(-) diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index 8e82cb74835..100e81a5bd2 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -215,16 +215,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct options_.prior_sources_behavior_, options_.dedupe_sources_, 
do_expensive_check_); -#if 1 // DEBUG - if (edge_label) { - std::cout << "options_.renumber_results_ = " << options_.renumber_results_ - << " edge_label.has_value()= true, edge_label->size()=" << edge_label->size() - << std::endl; - } else { - std::cout << "edge_label.has_value()= false, options_.renumber_results_=" - << options_.renumber_results_ << std::endl; - } -#endif std::vector vertex_partition_lasts = graph_view.vertex_partition_range_lasts(); diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index c7af16aaf40..b0b3bb5f4f2 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -168,39 +168,6 @@ void check_input_edges(raft::handle_t const& handle, size_t num_hops, bool do_expensive_check) { - CUGRAPH_EXPECTS((num_labels >= 1) && (num_labels <= std::numeric_limits::max()), - "Invalid input arguments: num_labels should be a positive integer and the " - "current implementation assumes that the number of unique labels is no larger " - "than std::numeric_limits::max()."); - CUGRAPH_EXPECTS((num_labels == 1) || edgelist_label_offsets.has_value(), - "Invalid input arguments: edgelist_label_offsets.has_value() should be true if " - "num_labels >= 2."); - CUGRAPH_EXPECTS( - !edgelist_label_offsets.has_value() || ((*edgelist_label_offsets).size() == num_labels + 1), - "Invalid input arguments: if edgelist_label_offsets is valid, (*edgelist_label_offsets).size() " - "(size of the offset array) should be num_labels + 1."); - - CUGRAPH_EXPECTS( - (num_hops >= 1) && (num_hops <= std::numeric_limits::max()), - "Invalid input arguments: num_hops should be a positive integer and the current implementation " - "assumes that the number of hops is no larger than std::numeric_limits::max()."); - CUGRAPH_EXPECTS( - (num_hops == 1) || edgelist_hops.has_value(), - "Invalid input arguments: edgelist_hops.has_value() should be true if num_hops >= 2."); - - CUGRAPH_EXPECTS((!seed_vertices.has_value() && !seed_vertex_label_offsets.has_value()) || - (seed_vertices.has_value() && - (edgelist_label_offsets.has_value() == seed_vertex_label_offsets.has_value())), - "Invaild input arguments: if seed_vertices.has_value() is false, " - "seed_vertex_label_offsets.has_value() should be false as well. 
If " - "seed_vertices.has_value( ) is true, seed_vertex_label_offsets.has_value() " - "should coincide with edgelist_label_offsets.has_value()."); - CUGRAPH_EXPECTS( - !seed_vertex_label_offsets.has_value() || - ((*seed_vertex_label_offsets).size() == num_labels + 1), - "Invalid input arguments: if seed_vertex_label_offsets is valid, " - "(*seed_vertex_label_offsets).size() (size of the offset array) should be num_labels + 1."); - CUGRAPH_EXPECTS( edgelist_majors.size() == edgelist_minors.size(), "Invalid input arguments: edgelist_srcs.size() and edgelist_dsts.size() should coincide."); @@ -220,6 +187,50 @@ void check_input_edges(raft::handle_t const& handle, "Invalid input arguments: if edgelist_hops is valid, (*edgelist_hops).size() and " "edgelist_(srcs|dsts).size() should coincide."); + CUGRAPH_EXPECTS( + !edgelist_label_offsets.has_value() || ((*edgelist_label_offsets).size() == num_labels + 1), + "Invalid input arguments: if edgelist_label_offsets is valid, (*edgelist_label_offsets).size() " + "(size of the offset array) should be num_labels + 1."); + + if (edgelist_majors.size() > 0) { + CUGRAPH_EXPECTS((num_labels >= 1) && (num_labels <= std::numeric_limits::max()), + "Invalid input arguments: num_labels should be a positive integer and the " + "current implementation assumes that the number of unique labels is no larger " + "than std::numeric_limits::max()."); + CUGRAPH_EXPECTS((num_labels == 1) || edgelist_label_offsets.has_value(), + "Invalid input arguments: edgelist_label_offsets.has_value() should be true if " + "num_labels >= 2."); + + CUGRAPH_EXPECTS( + (num_hops >= 1) && (num_hops <= std::numeric_limits::max()), + "Invalid input arguments: num_hops should be a positive integer and the current " + "implementation " + "assumes that the number of hops is no larger than std::numeric_limits::max()."); + CUGRAPH_EXPECTS( + (num_hops == 1) || edgelist_hops.has_value(), + "Invalid input arguments: edgelist_hops.has_value() should be true if num_hops >= 2."); + } else { + CUGRAPH_EXPECTS( + "num_labels == 0", + "Invalid input arguments: num_labels should be 0 if the input edge list is empty."); + CUGRAPH_EXPECTS( + "num_hops == 0", + "Invalid input arguments: num_hops should be 0 if the input edge list is empty."); + } + + CUGRAPH_EXPECTS((!seed_vertices.has_value() && !seed_vertex_label_offsets.has_value()) || + (seed_vertices.has_value() && + (edgelist_label_offsets.has_value() == seed_vertex_label_offsets.has_value())), + "Invaild input arguments: if seed_vertices.has_value() is false, " + "seed_vertex_label_offsets.has_value() should be false as well. 
If " + "seed_vertices.has_value( ) is true, seed_vertex_label_offsets.has_value() " + "should coincide with edgelist_label_offsets.has_value()."); + CUGRAPH_EXPECTS( + !seed_vertex_label_offsets.has_value() || + ((*seed_vertex_label_offsets).size() == num_labels + 1), + "Invalid input arguments: if seed_vertex_label_offsets is valid, " + "(*seed_vertex_label_offsets).size() (size of the offset array) should be num_labels + 1."); + if (do_expensive_check) { if (edgelist_label_offsets) { CUGRAPH_EXPECTS(thrust::is_sorted(handle.get_thrust_policy(), From 37f53af2ca9430ae10b5636188e8bd352cb04f58 Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Mon, 15 Apr 2024 16:07:05 -0400 Subject: [PATCH 19/80] fix tests, use _copy in tests --- .../cugraph_pyg/nn/conv/hetero_gat_conv.py | 4 +- .../cugraph_pyg/tests/nn/test_gat_conv.py | 14 ++--- .../cugraph_pyg/tests/nn/test_gatv2_conv.py | 8 +-- .../tests/nn/test_hetero_gat_conv.py | 20 +++---- .../cugraph_pyg/tests/nn/test_rgcn_conv.py | 8 +-- .../cugraph_pyg/tests/nn/test_sage_conv.py | 6 +- .../tests/nn/test_transformer_conv.py | 57 +++++++++++++------ 7 files changed, 70 insertions(+), 47 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py index 6b648c1b77a..a73dd8e57ff 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py @@ -226,7 +226,7 @@ def forward( ) if src_type == dst_type: - graph = self.get_cugraph( + graph, _ = self.get_cugraph( csc, bipartite=False, ) @@ -241,7 +241,7 @@ def forward( ) else: - graph = self.get_cugraph( + graph, _ = self.get_cugraph( csc, bipartite=True, ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py index 5e0cac1cb3f..a26063f62fa 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py @@ -87,16 +87,16 @@ def test_gat_conv_equality( out_dim = heads * out_channels with torch.no_grad(): if bipartite: - conv2.lin_src.weight.data = conv1.lin_src.weight.data.detach().clone() - conv2.lin_dst.weight.data = conv1.lin_dst.weight.data.detach().clone() + conv2.lin_src.weight.copy_(conv1.lin_src.weight) + conv2.lin_dst.weight.copy_(conv1.lin_dst.weight) else: - conv2.lin.weight.data = conv1.lin.weight.data.detach().clone() + conv2.lin.weight.copy_(conv1.lin.weight) - conv2.att.data[:out_dim] = conv1.att_src.data.flatten() - conv2.att.data[out_dim : 2 * out_dim] = conv1.att_dst.data.flatten() + conv2.att[:out_dim].copy_(conv1.att_src.flatten()) + conv2.att[out_dim : 2 * out_dim].copy_(conv1.att_dst.flatten()) if use_edge_attr: - conv2.att.data[2 * out_dim :] = conv1.att_edge.data.flatten() - conv2.lin_edge.weight.data = conv1.lin_edge.weight.data.detach().clone() + conv2.att[2 * out_dim :].copy_(conv1.att_edge.flatten()) + conv2.lin_edge.weight.copy_(conv1.lin_edge.weight) out1 = conv1(x, edge_index, edge_attr=edge_attr) if use_edge_index: diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py index 120a0348317..a62f2fed2f7 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py @@ -72,11 +72,11 @@ def test_gatv2_conv_equality( conv2 = CuGraphGATv2Conv(in_channels, out_channels, heads, **kwargs).cuda() with torch.no_grad(): - conv2.lin_src.weight.data = 
conv1.lin_l.weight.data.detach().clone() - conv2.lin_dst.weight.data = conv1.lin_r.weight.data.detach().clone() - conv2.att.data = conv1.att.data.flatten().detach().clone() + conv2.lin_src.weight.copy_(conv1.lin_l.weight) + conv2.lin_dst.weight.copy_(conv1.lin_r.weight) + conv2.att.copy_(conv1.att.flatten()) if use_edge_attr: - conv2.lin_edge.weight.data = conv1.lin_edge.weight.data.detach().clone() + conv2.lin_edge.weight.copy_(conv1.lin_edge.weight) out1 = conv1(x, edge_index, edge_attr=edge_attr) if use_edge_index: diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py index 5b25e7dc334..d8190ea345f 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py @@ -66,18 +66,18 @@ def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads): for edge_type in conv2.edge_types: src_t, _, dst_t = edge_type if src_t == dst_t: - w_src[edge_type][:, :] = conv1.convs[edge_type].lin.weight[:, :] + w_src[edge_type].copy_(conv1.convs[edge_type].lin.weight) else: - w_src[edge_type][:, :] = conv1.convs[edge_type].lin_src.weight[:, :] + w_src[edge_type].copy_(conv1.convs[edge_type].lin_src.weight) if w_dst[edge_type] is not None: - w_dst[edge_type][:, :] = conv1.convs[edge_type].lin_dst.weight[:, :] - - conv2.attn_weights[edge_type][: heads * out_channels] = conv1.convs[ - edge_type - ].att_src.data.flatten() - conv2.attn_weights[edge_type][heads * out_channels :] = conv1.convs[ - edge_type - ].att_dst.data.flatten() + w_dst[edge_type].copy_(conv1.convs[edge_type].lin_dst.weight) + + conv2.attn_weights[edge_type][: heads * out_channels].copy_( + conv1.convs[edge_type].att_src.flatten() + ) + conv2.attn_weights[edge_type][heads * out_channels :].copy_( + conv1.convs[edge_type].att_dst.flatten() + ) out1 = conv1(data.x_dict, data.edge_index_dict) out2 = conv2(data.x_dict, data.edge_index_dict) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py index fedc7edabe9..fc0aaf25b7b 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py @@ -60,12 +60,12 @@ def test_rgcn_conv_equality( with torch.no_grad(): if root_weight: - conv2.weight.data[:-1] = conv1.weight.data - conv2.weight.data[-1] = conv1.root.data + conv2.weight[:-1].copy_(conv1.weight) + conv2.weight[-1].copy_(conv1.root) else: - conv2.weight.data = conv1.weight.data.detach().clone() + conv2.weight.copy_(conv1.weight) if num_bases is not None: - conv2.comp.data = conv1.comp.data.detach().clone() + conv2.comp.copy_(conv1.comp) out1 = conv1(x, edge_index, edge_type) if use_edge_index: diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py index 62a32a603b1..9d8d413c590 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py @@ -69,11 +69,11 @@ def test_sage_conv_equality( in_channels_src = conv2.in_channels_src with torch.no_grad(): - conv2.lin.weight.data[:, :in_channels_src] = conv1.lin_l.weight.data + conv2.lin.weight[:, :in_channels_src].copy_(conv1.lin_l.weight) if root_weight: - conv2.lin.weight.data[:, in_channels_src:] = conv1.lin_r.weight.data + conv2.lin.weight[:, in_channels_src:].copy_(conv1.lin_r.weight) if bias: - conv2.lin.bias.data[:] = 
conv1.lin_l.bias.data + conv2.lin.bias.copy_(conv1.lin_l.bias) out1 = conv1(x, edge_index) out2 = conv2(x, csc, max_num_neighbors=max_num_neighbors) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py index 85461cdeb38..1776b691c87 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py @@ -19,12 +19,13 @@ @pytest.mark.parametrize("use_edge_index", [True, False]) +@pytest.mark.parametrize("use_edge_attr", [True, False]) @pytest.mark.parametrize("bipartite", [True, False]) @pytest.mark.parametrize("concat", [True, False]) @pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) def test_transformer_conv_equality( - use_edge_index, bipartite, concat, heads, graph, request + use_edge_index, use_edge_attr, bipartite, concat, heads, graph, request ): pytest.importorskip("torch_geometric", reason="PyG not available") import torch @@ -35,14 +36,6 @@ def test_transformer_conv_equality( edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() - if use_edge_index: - csc = EdgeIndex(edge_index, sparse_size=size) - else: - csc = CuGraphTransformerConv.to_csc(edge_index, size) - - out_channels = 2 - kwargs = dict(concat=concat, bias=False, root_weight=False) - if bipartite: in_channels = (5, 3) x = ( @@ -52,20 +45,45 @@ def test_transformer_conv_equality( else: in_channels = 5 x = torch.rand(size[0], in_channels, device="cuda") + out_channels = 2 + + if use_edge_attr: + edge_dim = 3 + edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda() + else: + edge_dim = edge_attr = None + + if use_edge_index: + csc = EdgeIndex(edge_index, sparse_size=size) + else: + if use_edge_attr: + csc, edge_attr_perm = CuGraphTransformerConv.to_csc( + edge_index, size, edge_attr=edge_attr + ) + else: + csc = CuGraphTransformerConv.to_csc(edge_index, size) + edge_attr_perm = None + + kwargs = dict(concat=concat, bias=False, edge_dim=edge_dim, root_weight=False) conv1 = TransformerConv(in_channels, out_channels, heads, **kwargs).cuda() conv2 = CuGraphTransformerConv(in_channels, out_channels, heads, **kwargs).cuda() with torch.no_grad(): - conv2.lin_query.weight.data = conv1.lin_query.weight.data.detach().clone() - conv2.lin_key.weight.data = conv1.lin_key.weight.data.detach().clone() - conv2.lin_value.weight.data = conv1.lin_value.weight.data.detach().clone() - conv2.lin_query.bias.data = conv1.lin_query.bias.data.detach().clone() - conv2.lin_key.bias.data = conv1.lin_key.bias.data.detach().clone() - conv2.lin_value.bias.data = conv1.lin_value.bias.data.detach().clone() + conv2.lin_query.weight.copy_(conv1.lin_query.weight) + conv2.lin_key.weight.copy_(conv1.lin_key.weight) + conv2.lin_value.weight.copy_(conv1.lin_value.weight) + conv2.lin_query.bias.copy_(conv1.lin_query.bias) + conv2.lin_key.bias.copy_(conv1.lin_key.bias) + conv2.lin_value.bias.copy_(conv1.lin_value.bias) + if use_edge_attr: + conv2.lin_edge.weight.copy_(conv1.lin_edge.weight) - out1 = conv1(x, edge_index) - out2 = conv2(x, csc) + out1 = conv1(x, edge_index, edge_attr=edge_attr) + if use_edge_index: + out2 = conv2(x, csc, edge_attr=edge_attr) + else: + out2 = conv2(x, csc, edge_attr=edge_attr_perm) assert torch.allclose(out1, out2, atol=ATOL) @@ -89,3 +107,8 @@ def test_transformer_conv_equality( assert torch.allclose( conv1.lin_value.bias.grad, 
conv2.lin_value.bias.grad, atol=ATOL ) + + if use_edge_attr: + assert torch.allclose( + conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=ATOL + ) From 4285b477b715744e64da13cc0bb182ba35caa7f6 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 17 Apr 2024 12:09:59 -0700 Subject: [PATCH 20/80] C API --- cpp/include/cugraph_c/sampling_algorithms.h | 12 ++++ cpp/src/c_api/uniform_neighbor_sampling.cpp | 64 +++++++++++++------ cpp/tests/c_api/create_graph_test.c | 2 +- .../c_api/uniform_neighbor_sample_test.c | 2 + 4 files changed, 60 insertions(+), 20 deletions(-) diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index 5760d2098aa..859eaca7f3b 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ -236,6 +236,15 @@ typedef enum cugraph_compression_type_t { cugraph_error_code_t cugraph_sampling_options_create(cugraph_sampling_options_t** options, cugraph_error_t** error); +/** + * @ingroup samplingC + * @brief Set flag to retain seeds (original sources) + * + * @param options - opaque pointer to the sampling options + * @param value - Boolean value to assign to the option + */ +void cugraph_sampling_set_retain_seeds(cugraph_sampling_options_t* options, bool_t value); + /** * @ingroup samplingC * @brief Set flag to renumber results @@ -335,6 +344,8 @@ void cugraph_sampling_options_free(cugraph_sampling_options_t* options); * output. If specified then the all data from @p label_list[i] will be shuffled to rank @p. This * cannot be specified unless @p start_vertex_labels is also specified * label_to_comm_rank[i]. If not specified then the output data will not be shuffled between ranks. + * @param [in] label_offsets Device array of the offsets for each label in the seed list. This + * parameter is only used with the retain_seeds option. * @param [in] fanout Host array defining the fan out at each step in the sampling algorithm. 
* We only support fanout values of type INT32 * @param [in/out] rng_state State of the random number generator, updated with each call @@ -354,6 +365,7 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample( const cugraph_type_erased_device_array_view_t* start_vertex_labels, const cugraph_type_erased_device_array_view_t* label_list, const cugraph_type_erased_device_array_view_t* label_to_comm_rank, + const cugraph_type_erased_device_array_view_t* label_offsets, const cugraph_type_erased_host_array_view_t* fan_out, cugraph_rng_state_t* rng_state, const cugraph_sampling_options_t* options, diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index 100e81a5bd2..0dcafa6eab6 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -40,6 +40,7 @@ struct cugraph_sampling_options_t { bool_t renumber_results_{FALSE}; cugraph_compression_type_t compression_type_{cugraph_compression_type_t::COO}; bool_t compress_per_hop_{FALSE}; + bool_t retain_seeds_{FALSE}; }; struct cugraph_sample_result_t { @@ -68,6 +69,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertex_labels_{nullptr}; cugraph::c_api::cugraph_type_erased_device_array_view_t const* label_list_{nullptr}; cugraph::c_api::cugraph_type_erased_device_array_view_t const* label_to_comm_rank_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* label_offsets_{nullptr}; cugraph::c_api::cugraph_type_erased_host_array_view_t const* fan_out_{nullptr}; cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr}; cugraph::c_api::cugraph_sampling_options_t options_{}; @@ -81,6 +83,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct cugraph_type_erased_device_array_view_t const* start_vertex_labels, cugraph_type_erased_device_array_view_t const* label_list, cugraph_type_erased_device_array_view_t const* label_to_comm_rank, + cugraph_type_erased_device_array_view_t const* label_offsets, cugraph_type_erased_host_array_view_t const* fan_out, cugraph_rng_state_t* rng_state, cugraph::c_api::cugraph_sampling_options_t options, @@ -99,6 +102,9 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct label_to_comm_rank_( reinterpret_cast( label_to_comm_rank)), + label_offsets_( + reinterpret_cast( + label_offsets)), fan_out_( reinterpret_cast(fan_out)), rng_state_(reinterpret_cast(rng_state)), @@ -149,10 +155,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct auto number_map = reinterpret_cast*>(graph_->number_map_); rmm::device_uvector start_vertices(start_vertices_->size_, handle_.get_stream()); - raft::copy(start_vertices.data(), - start_vertices_->as_type(), - start_vertices.size(), - handle_.get_stream()); std::optional> start_vertex_labels{std::nullopt}; @@ -267,8 +269,13 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct std::move(edge_id), std::move(edge_type), std::move(hop), - std::nullopt, - std::nullopt, + options_.retain_seeds_ + ? std::make_optional(raft::device_span{ + start_vertices_->as_type(), start_vertices_->size_}) + : std::nullopt, + options_.retain_seeds_ ? std::make_optional(raft::device_span{ + label_offsets_->as_type(), label_offsets_->size_}) + : std::nullopt, offsets ? 
std::make_optional( raft::device_span{offsets->data(), offsets->size()}) : std::nullopt, @@ -304,8 +311,13 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct std::move(edge_id), std::move(edge_type), std::move(hop), - std::nullopt, - std::nullopt, + options_.retain_seeds_ + ? std::make_optional(raft::device_span{ + start_vertices_->as_type(), start_vertices_->size_}) + : std::nullopt, + options_.retain_seeds_ ? std::make_optional(raft::device_span{ + label_offsets_->as_type(), label_offsets_->size_}) + : std::nullopt, offsets ? std::make_optional( raft::device_span{offsets->data(), offsets->size()}) : std::nullopt, @@ -402,6 +414,12 @@ extern "C" cugraph_error_code_t cugraph_sampling_options_create( return CUGRAPH_SUCCESS; } +extern "C" void cugraph_sampling_set_retain_seeds(cugraph_sampling_options_t* options, bool_t value) +{ + auto internal_pointer = reinterpret_cast(options); + internal_pointer->retain_seeds_ = value; +} + extern "C" void cugraph_sampling_set_renumber_results(cugraph_sampling_options_t* options, bool_t value) { @@ -871,6 +889,7 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample( const cugraph_type_erased_device_array_view_t* start_vertex_labels, const cugraph_type_erased_device_array_view_t* label_list, const cugraph_type_erased_device_array_view_t* label_to_comm_rank, + const cugraph_type_erased_device_array_view_t* label_offsets, const cugraph_type_erased_host_array_view_t* fan_out, cugraph_rng_state_t* rng_state, const cugraph_sampling_options_t* options, @@ -878,6 +897,13 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample( cugraph_sample_result_t** result, cugraph_error_t** error) { + auto options_cpp = *reinterpret_cast(options); + + CAPI_EXPECTS((!options_cpp.retain_seeds_) || (label_offsets != nullptr), + CUGRAPH_INVALID_INPUT, + "must specify label_offsets if retain_seeds is true", + *error); + CAPI_EXPECTS((start_vertex_labels == nullptr) || (reinterpret_cast( start_vertex_labels) @@ -911,16 +937,16 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample( "fan_out should be of type int", *error); - uniform_neighbor_sampling_functor functor{ - handle, - graph, - start_vertices, - start_vertex_labels, - label_list, - label_to_comm_rank, - fan_out, - rng_state, - *reinterpret_cast(options), - do_expensive_check}; + uniform_neighbor_sampling_functor functor{handle, + graph, + start_vertices, + start_vertex_labels, + label_list, + label_to_comm_rank, + label_offsets, + fan_out, + rng_state, + std::move(options_cpp), + do_expensive_check}; return cugraph::c_api::run_algorithm(graph, functor, result, error); } diff --git a/cpp/tests/c_api/create_graph_test.c b/cpp/tests/c_api/create_graph_test.c index 11da2eb8589..c28615d6f9d 100644 --- a/cpp/tests/c_api/create_graph_test.c +++ b/cpp/tests/c_api/create_graph_test.c @@ -263,7 +263,7 @@ int test_create_sg_graph_csr() cugraph_sampling_set_compress_per_hop(sampling_options, compress_per_hop); ret_code = cugraph_uniform_neighbor_sample( - handle, graph, d_start_view, NULL, NULL, NULL, h_fan_out_view, rng_state, sampling_options, FALSE, &result, &ret_error); + handle, graph, d_start_view, NULL, NULL, NULL, NULL, h_fan_out_view, rng_state, sampling_options, FALSE, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed."); diff --git a/cpp/tests/c_api/uniform_neighbor_sample_test.c b/cpp/tests/c_api/uniform_neighbor_sample_test.c index 
15b2e937661..34bb6f7b6ef 100644 --- a/cpp/tests/c_api/uniform_neighbor_sample_test.c +++ b/cpp/tests/c_api/uniform_neighbor_sample_test.c @@ -139,6 +139,7 @@ int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle d_start_labels_view, NULL, NULL, + NULL, h_fan_out_view, rng_state, sampling_options, @@ -640,6 +641,7 @@ int test_uniform_neighbor_sample_with_labels(const cugraph_resource_handle_t* ha d_start_labels_view, NULL, NULL, + NULL, h_fan_out_view, rng_state, sampling_options, From d4850ee34c4217fac6ca614be46ed99cd7d968e9 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 17 Apr 2024 12:29:16 -0700 Subject: [PATCH 21/80] style --- cpp/tests/c_api/create_graph_test.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/cpp/tests/c_api/create_graph_test.c b/cpp/tests/c_api/create_graph_test.c index 1d3c63368d6..41b8691e79c 100644 --- a/cpp/tests/c_api/create_graph_test.c +++ b/cpp/tests/c_api/create_graph_test.c @@ -262,8 +262,19 @@ int test_create_sg_graph_csr() cugraph_sampling_set_compression_type(sampling_options, compression); cugraph_sampling_set_compress_per_hop(sampling_options, compress_per_hop); - ret_code = cugraph_uniform_neighbor_sample( - handle, graph, d_start_view, NULL, NULL, NULL, NULL, h_fan_out_view, rng_state, sampling_options, FALSE, &result, &ret_error); + ret_code = cugraph_uniform_neighbor_sample(handle, + graph, + d_start_view, + NULL, + NULL, + NULL, + NULL, + h_fan_out_view, + rng_state, + sampling_options, + FALSE, + &result, + &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed."); From 15dce372a1816600b818fcc30157b484d291e9ba Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 17 Apr 2024 12:47:44 -0700 Subject: [PATCH 22/80] pylibcugraph --- .../pylibcugraph/_cugraph_c/algorithms.pxd | 6 ++++ .../_cugraph_c/sampling_algorithms.pxd | 3 +- .../pylibcugraph/uniform_neighbor_sample.pyx | 29 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index b0e7ffaf82d..4da7c4328fd 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -292,6 +292,12 @@ cdef extern from "cugraph_c/algorithms.h": bool_t value, ) + cdef void \ + cugraph_sampling_set_retain_seeds( + cugraph_sampling_options_t* options, + bool_t value, + ) + cdef void \ cugraph_sampling_set_with_replacement( cugraph_sampling_options_t* options, diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd index c32b57f8621..dbd3ef4b7e1 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -50,6 +50,7 @@ cdef extern from "cugraph_c/sampling_algorithms.h": const cugraph_type_erased_device_array_view_t* start_vertex_labels, const cugraph_type_erased_device_array_view_t* label_list, const cugraph_type_erased_device_array_view_t* label_to_comm_rank, + const cugraph_type_erased_device_array_view_t* label_offsets, const cugraph_type_erased_host_array_view_t* fan_out, cugraph_rng_state_t* rng_state, const cugraph_sampling_options_t* options, diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index b4145a80095..a42086dc099 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -49,6 +49,7 @@ from pylibcugraph._cugraph_c.algorithms cimport ( cugraph_sampling_set_renumber_results, cugraph_sampling_set_compress_per_hop, cugraph_sampling_set_compression_type, + cugraph_sampling_set_retain_seeds, ) from pylibcugraph._cugraph_c.sampling_algorithms cimport ( cugraph_uniform_neighbor_sample, @@ -89,10 +90,12 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, batch_id_list=None, label_list=None, label_to_output_comm_rank=None, + label_offsets=None, prior_sources_behavior=None, deduplicate_sources=False, return_hops=False, renumber=False, + retain_seeds=False, compression='COO', compress_per_hop=False, random_state=None, @@ -143,6 +146,9 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, worker that should hold results for that batch id. Defaults to NULL (does nothing) + label_offsets: list[int] (Optional) + Offsets of each label within the start vertex list. + prior_sources_behavior: str (Optional) Options are "carryover", and "exclude". Default will leave the source list as-is. @@ -160,6 +166,11 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, per-batch basis and return the renumber map and batch offsets in additional to the standard returns. + retain_seeds: bool (Optional) + If True, will retain the original seeds (original source vertices) + in the output even if they do not have outgoing neighbors. + Defaults to False. + compression: str (Optional) Options: COO (default), CSR, CSC, DCSR, DCSR Sets the compression format for the returned samples. 
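As a quick sketch of how the new label_offsets argument lines up with the seed list at the pylibcugraph layer (the vertex and batch values below are illustrative; they mirror the SG test added later in this series):

    import cupy

    # Two batches (labels 0 and 1) over four seed vertices.
    start_list    = cupy.asarray([6, 0, 1, 7], dtype="int64")   # seeds for both batches
    batch_id_list = cupy.asarray([0, 0, 1, 1], dtype="int32")   # label of each seed
    label_offsets = cupy.asarray([0, 2, 4], dtype="int64")      # batch 0 -> seeds[0:2], batch 1 -> seeds[2:4]

    # With retain_seeds=True, label_offsets is required, and vertices 6, 0, 1 and 7
    # are kept in the output (and in the renumber map) even if they have no
    # outgoing edges.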
@@ -234,6 +245,11 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, cai_label_to_output_comm_rank_ptr = \ label_to_output_comm_rank.__cuda_array_interface__['data'][0] + cdef uintptr_t cai_label_offsets_ptr + if label_offsets is not None: + cai_label_offsets_ptr = \ + label_offsets.__cuda_array_interface__['data'][0] + cdef uintptr_t ai_fan_out_ptr = \ h_fan_out.__array_interface__["data"][0] @@ -270,6 +286,17 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, get_c_type_from_numpy_type(label_to_output_comm_rank.dtype) ) + cdef cugraph_type_erased_device_array_view_t* label_offsets_ptr = NULL + if retain_seeds: + if label_offsets is None: + raise ValueError("Must provide label offsets if retain_seeds is True") + label_offsets_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_label_offsets_ptr, + len(label_offsets), + get_c_type_from_numpy_type(label_offsets.dtype) + ) + cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = \ cugraph_type_erased_host_array_view_create( ai_fan_out_ptr, @@ -323,6 +350,7 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, cugraph_sampling_set_renumber_results(sampling_options, c_renumber) cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e) cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop) + cugraph_sampling_set_retain_seeds(sampling_options, retain_seeds) error_code = cugraph_uniform_neighbor_sample( c_resource_handle_ptr, @@ -331,6 +359,7 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, batch_id_ptr, label_list_ptr, label_to_output_comm_rank_ptr, + label_offsets_ptr, fan_out_ptr, rng_state_ptr, sampling_options, From 5243a4dccbe3655d905c6529805faa50cd9952a9 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 17 Apr 2024 12:54:27 -0700 Subject: [PATCH 23/80] cugraph --- .../cugraph/dask/sampling/uniform_neighbor_sample.py | 12 +++++++++++- .../cugraph/sampling/uniform_neighbor_sample.py | 6 ++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py index 03746561817..8c3191064ae 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -167,6 +167,7 @@ def _call_plc_uniform_neighbor_sample( prior_sources_behavior=None, deduplicate_sources=False, renumber=False, + retain_seeds=False, use_legacy_names=True, include_hop_column=True, compress_per_hop=False, @@ -200,6 +201,7 @@ def _call_plc_uniform_neighbor_sample( deduplicate_sources=deduplicate_sources, return_hops=return_hops, renumber=renumber, + retain_seeds=retain_seeds, compression=compression, compress_per_hop=compress_per_hop, return_dict=True, @@ -241,6 +243,7 @@ def _mg_call_plc_uniform_neighbor_sample( prior_sources_behavior=None, deduplicate_sources=False, renumber=False, + retain_seeds=False, use_legacy_names=True, include_hop_column=True, compress_per_hop=False, @@ -276,6 +279,7 @@ def _mg_call_plc_uniform_neighbor_sample( prior_sources_behavior=prior_sources_behavior, deduplicate_sources=deduplicate_sources, renumber=renumber, + retain_seeds=retain_seeds, use_legacy_names=use_legacy_names, # remove in 23.12 include_hop_column=include_hop_column, # remove in 23.12 compress_per_hop=compress_per_hop, @@ -359,6 +363,7 @@ def uniform_neighbor_sample( prior_sources_behavior: str = None, deduplicate_sources: bool = False, renumber: bool = False, + retain_seeds: bool = False, use_legacy_names=True, # deprecated compress_per_hop=False, compression="COO", @@ -440,6 +445,10 @@ def uniform_neighbor_sample( will return the renumber map and renumber map offsets as an additional dataframe. + retain_seeds: bool, optional (default=False) + If True, will retain the original seeds (original source vertices) + in the output even if they do not have outgoing neighbors. + use_legacy_names: bool, optional (default=True) Whether to use the legacy column names (sources, destinations). If True, will use "sources" and "destinations" as the column names. @@ -670,6 +679,7 @@ def uniform_neighbor_sample( "prior_sources_behavior": prior_sources_behavior, "deduplicate_sources": deduplicate_sources, "renumber": renumber, + "retain_seeds": retain_seeds, "use_legacy_names": use_legacy_names, "include_hop_column": include_hop_column, "compress_per_hop": compress_per_hop, diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index 86b33594ed7..45911e3248b 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -71,6 +71,7 @@ def uniform_neighbor_sample( prior_sources_behavior: str = None, deduplicate_sources: bool = False, renumber: bool = False, + retain_seeds: bool = False, use_legacy_names: bool = True, # deprecated compress_per_hop: bool = False, compression: str = "COO", @@ -142,6 +143,10 @@ def uniform_neighbor_sample( will return the renumber map and renumber map offsets as an additional dataframe. + retain_seeds: bool, optional (default=False) + If True, will retain the original seeds (original source vertices) + in the output even if they do not have outgoing neighbors. + use_legacy_names: bool, optional (default=True) Whether to use the legacy column names (sources, destinations). If True, will use "sources" and "destinations" as the column names. 
@@ -349,6 +354,7 @@ def uniform_neighbor_sample( deduplicate_sources=deduplicate_sources, return_hops=return_hops, renumber=renumber, + retain_seeds=retain_seeds, compression=compression, compress_per_hop=compress_per_hop, return_dict=True, From ba6ee3bea2700361206bf15adaa2124cf17d7382 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 18 Apr 2024 10:39:33 -0700 Subject: [PATCH 24/80] retention --- cpp/src/c_api/uniform_neighbor_sampling.cpp | 3 +++ .../cugraph/cugraph/sampling/uniform_neighbor_sample.py | 9 ++++++++- .../pylibcugraph/uniform_neighbor_sample.pyx | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index 0dcafa6eab6..c2d7c6dee2a 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -29,6 +29,8 @@ #include +#include + namespace cugraph { namespace c_api { @@ -250,6 +252,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct if (options_.renumber_results_) { if (options_.compression_type_ == cugraph_compression_type_t::COO) { // COO + std::cout << "retain seeds? " << options_.retain_seeds_ << std::endl; rmm::device_uvector output_majors(0, handle_.get_stream()); rmm::device_uvector output_renumber_map(0, handle_.get_stream()); diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index 45911e3248b..bab5e90e95c 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -72,6 +72,7 @@ def uniform_neighbor_sample( deduplicate_sources: bool = False, renumber: bool = False, retain_seeds: bool = False, + label_offsets: Sequence=None, use_legacy_names: bool = True, # deprecated compress_per_hop: bool = False, compression: str = "COO", @@ -147,6 +148,11 @@ def uniform_neighbor_sample( If True, will retain the original seeds (original source vertices) in the output even if they do not have outgoing neighbors. + label_offsets: integer sequence, optional (default=None) + Offsets of each label within the start vertex list. + Only used if retain_seeds is True. Required if retain_seeds + is True. + use_legacy_names: bool, optional (default=True) Whether to use the legacy column names (sources, destinations). If True, will use "sources" and "destinations" as the column names. 
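For context on how retain_seeds and label_offsets are meant to be passed together at this layer, a minimal sketch (the toy graph and seed values are illustrative and mirror the SG test added later in this series; other sampling options are left at their defaults):

    import cudf
    import cupy
    import cugraph

    df = cudf.DataFrame({
        "src": cupy.array([0, 1, 2, 3, 4, 5], dtype="int64"),
        "dst": cupy.array([2, 3, 1, 7, 5, 6], dtype="int64"),
    })
    G = cugraph.MultiGraph(directed=True)
    G.from_cudf_edgelist(df, source="src", destination="dst")

    # Seeds for two batches; vertices 6 and 7 have no outgoing edges.
    batch_df = cudf.DataFrame({
        "seeds": cupy.array([6, 0, 1, 7], dtype="int64"),
        "batch": cupy.array([0, 0, 1, 1], dtype="int32"),
    })
    batch_offsets = cudf.Series(cupy.array([0, 2, 4], dtype="int64"))

    results, offsets, renumber_map = cugraph.uniform_neighbor_sample(
        G,
        batch_df,
        [2, 2],                       # fanout per hop
        with_replacement=False,
        with_edge_properties=True,
        with_batch_ids=True,
        return_offsets=True,
        renumber=True,
        retain_seeds=True,
        label_offsets=batch_offsets,  # required when retain_seeds=True
    )
    # Because retain_seeds=True, seeds 6 and 7 still appear in each batch's
    # renumber map even though they have no outgoing neighbors.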
@@ -347,7 +353,7 @@ def uniform_neighbor_sample( else None, h_fan_out=fanout_vals, with_replacement=with_replacement, - do_expensive_check=False, + do_expensive_check=True, with_edge_properties=with_edge_properties, random_state=random_state, prior_sources_behavior=prior_sources_behavior, @@ -355,6 +361,7 @@ def uniform_neighbor_sample( return_hops=return_hops, renumber=renumber, retain_seeds=retain_seeds, + label_offsets=label_offsets, compression=compression, compress_per_hop=compress_per_hop, return_dict=True, diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index a42086dc099..81849e9fdc1 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -221,6 +221,7 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, assert_CAI_type(batch_id_list, "batch_id_list", True) assert_CAI_type(label_list, "label_list", True) assert_CAI_type(label_to_output_comm_rank, "label_to_output_comm_rank", True) + assert_CAI_type(label_offsets, "label_offsets", True) assert_AI_type(h_fan_out, "h_fan_out") cdef cugraph_sample_result_t* result_ptr From c01b548ccb47adbeb138270a68ba03fda0516259 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 18 Apr 2024 12:11:33 -0700 Subject: [PATCH 25/80] fix copy bug --- cpp/src/c_api/uniform_neighbor_sampling.cpp | 7 ++++--- .../pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index c2d7c6dee2a..978a254b80c 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -29,8 +29,6 @@ #include -#include - namespace cugraph { namespace c_api { @@ -157,6 +155,10 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct auto number_map = reinterpret_cast*>(graph_->number_map_); rmm::device_uvector start_vertices(start_vertices_->size_, handle_.get_stream()); + raft::copy(start_vertices.data(), + start_vertices_->as_type(), + start_vertices.size(), + handle_.get_stream()); std::optional> start_vertex_labels{std::nullopt}; @@ -252,7 +254,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct if (options_.renumber_results_) { if (options_.compression_type_ == cugraph_compression_type_t::COO) { // COO - std::cout << "retain seeds? " << options_.retain_seeds_ << std::endl; rmm::device_uvector output_majors(0, handle_.get_stream()); rmm::device_uvector output_renumber_map(0, handle_.get_stream()); diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index 81849e9fdc1..f002622f497 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -377,6 +377,8 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, cugraph_type_erased_host_array_view_free(fan_out_ptr) if batch_id_list is not None: cugraph_type_erased_device_array_view_free(batch_id_ptr) + if label_offsets is not None: + cugraph_type_erased_device_array_view_free(label_offsets_ptr) # Have the SamplingResult instance assume ownership of the result data. 
result = SamplingResult() From 38271f61548ad917c322a5624ca30add7162c467 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 18 Apr 2024 14:59:52 -0700 Subject: [PATCH 26/80] fixes, give up on mg support outside of pylibcugraph/dist sampler --- cpp/src/c_api/uniform_neighbor_sampling.cpp | 2 ++ .../cugraph/dask/sampling/uniform_neighbor_sample.py | 12 +----------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index 978a254b80c..a0cadf1181d 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -29,6 +29,8 @@ #include +#include + namespace cugraph { namespace c_api { diff --git a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py index 8c3191064ae..03746561817 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -167,7 +167,6 @@ def _call_plc_uniform_neighbor_sample( prior_sources_behavior=None, deduplicate_sources=False, renumber=False, - retain_seeds=False, use_legacy_names=True, include_hop_column=True, compress_per_hop=False, @@ -201,7 +200,6 @@ def _call_plc_uniform_neighbor_sample( deduplicate_sources=deduplicate_sources, return_hops=return_hops, renumber=renumber, - retain_seeds=retain_seeds, compression=compression, compress_per_hop=compress_per_hop, return_dict=True, @@ -243,7 +241,6 @@ def _mg_call_plc_uniform_neighbor_sample( prior_sources_behavior=None, deduplicate_sources=False, renumber=False, - retain_seeds=False, use_legacy_names=True, include_hop_column=True, compress_per_hop=False, @@ -279,7 +276,6 @@ def _mg_call_plc_uniform_neighbor_sample( prior_sources_behavior=prior_sources_behavior, deduplicate_sources=deduplicate_sources, renumber=renumber, - retain_seeds=retain_seeds, use_legacy_names=use_legacy_names, # remove in 23.12 include_hop_column=include_hop_column, # remove in 23.12 compress_per_hop=compress_per_hop, @@ -363,7 +359,6 @@ def uniform_neighbor_sample( prior_sources_behavior: str = None, deduplicate_sources: bool = False, renumber: bool = False, - retain_seeds: bool = False, use_legacy_names=True, # deprecated compress_per_hop=False, compression="COO", @@ -445,10 +440,6 @@ def uniform_neighbor_sample( will return the renumber map and renumber map offsets as an additional dataframe. - retain_seeds: bool, optional (default=False) - If True, will retain the original seeds (original source vertices) - in the output even if they do not have outgoing neighbors. - use_legacy_names: bool, optional (default=True) Whether to use the legacy column names (sources, destinations). If True, will use "sources" and "destinations" as the column names. 
@@ -679,7 +670,6 @@ def uniform_neighbor_sample( "prior_sources_behavior": prior_sources_behavior, "deduplicate_sources": deduplicate_sources, "renumber": renumber, - "retain_seeds": retain_seeds, "use_legacy_names": use_legacy_names, "include_hop_column": include_hop_column, "compress_per_hop": compress_per_hop, From 49b908377ceebea3ab98295c1f89fc7e8059d638 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 18 Apr 2024 15:26:28 -0700 Subject: [PATCH 27/80] API --- cpp/src/c_api/uniform_neighbor_sampling.cpp | 6 ++--- .../c_api/uniform_neighbor_sample_test.c | 24 +++++++++---------- .../sampling/uniform_neighbor_sample.py | 2 +- python/nx-cugraph/README.md | 12 +++++----- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index a0cadf1181d..a66d8ef1d6b 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -158,9 +158,9 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct rmm::device_uvector start_vertices(start_vertices_->size_, handle_.get_stream()); raft::copy(start_vertices.data(), - start_vertices_->as_type(), - start_vertices.size(), - handle_.get_stream()); + start_vertices_->as_type(), + start_vertices.size(), + handle_.get_stream()); std::optional> start_vertex_labels{std::nullopt}; diff --git a/cpp/tests/c_api/uniform_neighbor_sample_test.c b/cpp/tests/c_api/uniform_neighbor_sample_test.c index 3ca227019a4..451dbca51a7 100644 --- a/cpp/tests/c_api/uniform_neighbor_sample_test.c +++ b/cpp/tests/c_api/uniform_neighbor_sample_test.c @@ -657,18 +657,18 @@ int test_uniform_neighbor_sample_with_labels(const cugraph_resource_handle_t* ha cugraph_sampling_set_compress_per_hop(sampling_options, compress_per_hop); ret_code = cugraph_uniform_neighbor_sample(handle, - graph, - d_start_view, - d_start_labels_view, - NULL, - NULL, - NULL, - h_fan_out_view, - rng_state, - sampling_options, - FALSE, - &result, - &ret_error); + graph, + d_start_view, + d_start_labels_view, + NULL, + NULL, + NULL, + h_fan_out_view, + rng_state, + sampling_options, + FALSE, + &result, + &ret_error); #ifdef NO_CUGRAPH_OPS TEST_ASSERT( diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index bab5e90e95c..eafadfa4ff0 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -72,7 +72,7 @@ def uniform_neighbor_sample( deduplicate_sources: bool = False, renumber: bool = False, retain_seeds: bool = False, - label_offsets: Sequence=None, + label_offsets: Sequence = None, use_legacy_names: bool = True, # deprecated compress_per_hop: bool = False, compression: str = "COO", diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index 75b5c1c5aa9..df80ae56a96 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -106,16 +106,16 @@ Below is the list of algorithms that are currently supported in nx-cugraph. 
└─ generators └─ complete_bipartite_graph centrality - ├─ betweenness + ├─ betweenness │ ├─ betweenness_centrality │ └─ edge_betweenness_centrality - ├─ degree_alg + ├─ degree_alg │ ├─ degree_centrality │ ├─ in_degree_centrality │ └─ out_degree_centrality - ├─ eigenvector + ├─ eigenvector │ └─ eigenvector_centrality - └─ katz + └─ katz └─ katz_centrality cluster ├─ average_clustering @@ -126,12 +126,12 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ louvain └─ louvain_communities components - ├─ connected + ├─ connected │ ├─ connected_components │ ├─ is_connected │ ├─ node_connected_component │ └─ number_connected_components - └─ weakly_connected + └─ weakly_connected ├─ is_weakly_connected ├─ number_weakly_connected_components └─ weakly_connected_components From 85f40bcf08d409d8b60214aef771d664d75f796e Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 18 Apr 2024 15:37:04 -0700 Subject: [PATCH 28/80] retain seeds test --- .../sampling/test_uniform_neighbor_sample.py | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py index 560b80993d9..e6b491a1a75 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py @@ -963,6 +963,49 @@ def test_uniform_neighbor_sample_csr_csc_local(hops, seed): assert 1 == len(el[(el.src == majors.iloc[i]) & (el.dst == minors.iloc[i])]) +def test_uniform_neighbor_sample_retain_seeds(): + src = cupy.array([0, 1, 2, 3, 4, 5], dtype='int64') + dst = cupy.array([2, 3, 1, 7, 5, 6], dtype='int64') + + seeds = cupy.array([6, 0, 1, 7], dtype='int64') + batch = cupy.array([0, 0, 1, 1], dtype='int32') + batch_offsets = cupy.array([0, 2, 4], dtype='int64') + + fanout = [2, 2] + + df = cudf.DataFrame({'src':src, 'dst':dst}) + + G = cugraph.MultiGraph(directed=True) + G.from_cudf_edgelist(df, source='src', destination='dst') + + batch_df = cudf.DataFrame({ + 'seeds': seeds, + 'batch': batch + }) + batch_offsets_s = cudf.Series(batch_offsets, name='batch_offsets') + results, offsets, renumber_map = cugraph.uniform_neighbor_sample( + G, + batch_df, + fanout, + with_replacement=False, + with_edge_properties=True, + with_batch_ids=True, + random_state=62, + return_offsets=True, + label_offsets=batch_offsets_s, + return_hops=True, + prior_sources_behavior='exclude', + deduplicate_sources=True, + renumber=True, + retain_seeds=True, + compress_per_hop=False + ) + + assert(offsets.renumber_map_offsets.dropna().values_host.tolist() == [0, 4, 7]) + assert(renumber_map.renumber_map.values_host[[0, 1]].tolist() == [0, 6]) + assert(renumber_map.renumber_map.values_host[[4, 5]].tolist() == [1, 7]) + + @pytest.mark.sg @pytest.mark.skip(reason="needs to be written!") def test_uniform_neighbor_sample_dcsr_dcsc_global(): From 3708870d06e1d910e03228e6baa7639551a3fc3a Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 18 Apr 2024 15:37:40 -0700 Subject: [PATCH 29/80] c --- .../sampling/test_uniform_neighbor_sample.py | 31 +++++++++---------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py index e6b491a1a75..304ead6fea9 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py +++ 
b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py @@ -964,25 +964,22 @@ def test_uniform_neighbor_sample_csr_csc_local(hops, seed): def test_uniform_neighbor_sample_retain_seeds(): - src = cupy.array([0, 1, 2, 3, 4, 5], dtype='int64') - dst = cupy.array([2, 3, 1, 7, 5, 6], dtype='int64') + src = cupy.array([0, 1, 2, 3, 4, 5], dtype="int64") + dst = cupy.array([2, 3, 1, 7, 5, 6], dtype="int64") - seeds = cupy.array([6, 0, 1, 7], dtype='int64') - batch = cupy.array([0, 0, 1, 1], dtype='int32') - batch_offsets = cupy.array([0, 2, 4], dtype='int64') + seeds = cupy.array([6, 0, 1, 7], dtype="int64") + batch = cupy.array([0, 0, 1, 1], dtype="int32") + batch_offsets = cupy.array([0, 2, 4], dtype="int64") fanout = [2, 2] - df = cudf.DataFrame({'src':src, 'dst':dst}) + df = cudf.DataFrame({"src": src, "dst": dst}) G = cugraph.MultiGraph(directed=True) - G.from_cudf_edgelist(df, source='src', destination='dst') + G.from_cudf_edgelist(df, source="src", destination="dst") - batch_df = cudf.DataFrame({ - 'seeds': seeds, - 'batch': batch - }) - batch_offsets_s = cudf.Series(batch_offsets, name='batch_offsets') + batch_df = cudf.DataFrame({"seeds": seeds, "batch": batch}) + batch_offsets_s = cudf.Series(batch_offsets, name="batch_offsets") results, offsets, renumber_map = cugraph.uniform_neighbor_sample( G, batch_df, @@ -994,16 +991,16 @@ def test_uniform_neighbor_sample_retain_seeds(): return_offsets=True, label_offsets=batch_offsets_s, return_hops=True, - prior_sources_behavior='exclude', + prior_sources_behavior="exclude", deduplicate_sources=True, renumber=True, retain_seeds=True, - compress_per_hop=False + compress_per_hop=False, ) - assert(offsets.renumber_map_offsets.dropna().values_host.tolist() == [0, 4, 7]) - assert(renumber_map.renumber_map.values_host[[0, 1]].tolist() == [0, 6]) - assert(renumber_map.renumber_map.values_host[[4, 5]].tolist() == [1, 7]) + assert offsets.renumber_map_offsets.dropna().values_host.tolist() == [0, 4, 7] + assert renumber_map.renumber_map.values_host[[0, 1]].tolist() == [0, 6] + assert renumber_map.renumber_map.values_host[[4, 5]].tolist() == [1, 7] @pytest.mark.sg From cdda7944a09f13d272d543b05ad4bf8ba8aa007f Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 18 Apr 2024 15:42:57 -0700 Subject: [PATCH 30/80] revert readme --- python/nx-cugraph/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index df80ae56a96..75b5c1c5aa9 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -106,16 +106,16 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ generators └─ complete_bipartite_graph centrality - ├─ betweenness + ├─ betweenness │ ├─ betweenness_centrality │ └─ edge_betweenness_centrality - ├─ degree_alg + ├─ degree_alg │ ├─ degree_centrality │ ├─ in_degree_centrality │ └─ out_degree_centrality - ├─ eigenvector + ├─ eigenvector │ └─ eigenvector_centrality - └─ katz + └─ katz └─ katz_centrality cluster ├─ average_clustering @@ -126,12 +126,12 @@ Below is the list of algorithms that are currently supported in nx-cugraph. 
└─ louvain └─ louvain_communities components - ├─ connected + ├─ connected │ ├─ connected_components │ ├─ is_connected │ ├─ node_connected_component │ └─ number_connected_components - └─ weakly_connected + └─ weakly_connected ├─ is_weakly_connected ├─ number_weakly_connected_components └─ weakly_connected_components From e77b43ea000a929e81e3b56977293b9b45f27008 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 19 Apr 2024 10:15:37 -0700 Subject: [PATCH 31/80] remove debug header --- cpp/src/c_api/uniform_neighbor_sampling.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index a66d8ef1d6b..45609fc0e01 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -29,8 +29,6 @@ #include -#include - namespace cugraph { namespace c_api { From ff4a4547c7b89a2285d03de387ec3b4406f60cc1 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 19 Apr 2024 12:29:30 -0700 Subject: [PATCH 32/80] first pass at new dist graph store --- conda/recipes/cugraph-pyg/meta.yaml | 1 + dependencies.yaml | 2 + .../cugraph_pyg/data/graph_store.py | 83 +++++++++++++++++++ 3 files changed, 86 insertions(+) create mode 100644 python/cugraph-pyg/cugraph_pyg/data/graph_store.py diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 4ada5e31211..bb2fd3246fa 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -35,6 +35,7 @@ requirements: - cugraph ={{ version }} - pylibcugraphops ={{ minor_version }} - pyg >=2.3,<2.5 + - tensordict >=0.1.2 tests: imports: diff --git a/dependencies.yaml b/dependencies.yaml index 9dca069ea33..7cf27bccaaa 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -481,6 +481,7 @@ dependencies: packages: - *numba - *numpy + - *tensordict - output_types: [pyproject] packages: - *cugraph @@ -566,6 +567,7 @@ dependencies: - pytorch>=2.0 - pytorch-cuda==11.8 - pyg>=2.4.0 + - tensordict>=0.1.2 depends_on_rmm: common: diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py new file mode 100644 index 00000000000..f1292d9b4b7 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py @@ -0,0 +1,83 @@ +from cugraph.utilities.utils import import_optional, MissingModule + +from typing import Union, Optional + +import numpy as np +import cupy +import cudf +import pandas + +# Have to use import_optional even though these are required +# dependencies in order to build properly. +torch_geometric = import_optional('torch_geometric') +torch = import_optional('torch') +tensordict = import_optional('tensordict') + +GraphStore = object if isinstance(torch_geometric, MissingModule) else torch_geometric.GraphStore +TensorType = Union['torch.Tensor', cupy.ndarray, np.ndarray, cudf.Series, pandas.Series] + +class DistGraphStore(GraphStore): + """ + This object uses lazy graph creation. Users can repeatedly call + put_edge_index, and the tensors won't be converted into a cuGraph + graph until one is needed (i.e. when creating a loader). 
+ """ + + def __init__(self, ): + self._edge_indices = tensordict.TensorDict({}, batch_size=(2,)) + self._sizes = {} + self.__graph = None + + def _put_edge_index(self, edge_index:'torch_geometric.typing.EdgeTensorType', edge_attr:'torch_geometric.data.EdgeAttr') ->bool: + if edge_attr.layout != 'coo': + raise ValueError("Only COO format supported") + + if isinstance(edge_index, (cupy.ndarray, cudf.Series)): + edge_index = torch.as_tensor(edge_index, device='cuda') + elif isinstance(edge_index, (np.ndarray)): + edge_index = torch.as_tensor(edge_index, device='cpu') + elif isinstance(edge_index, pandas.Series): + edge_index = torch.as_tensor(edge_index.values, device='cpu') + elif isinstance(edge_index, cudf.Series): + edge_index = torch.as_tensor(edge_index.values, device='cuda') + + self._edge_indices[edge_attr.edge_type] = torch.stack(edge_index) + self._sizes[edge_attr.edge_type] = edge_attr.size + + # invalidate the graph + self.__graph = None + return True + + def _get_edge_index(self, edge_attr:'torch_geometric.data.EdgeAttr')->Optional['torch_geometric.typing.EdgeTensorType']: + ei = torch_geometric.EdgeIndex( + self._edge_indices[edge_attr.edge_type] + ) + + + if edge_attr.layout == 'csr': + return ei.sort_by('row').values.get_csr() + elif edge_attr.layout == 'csc': + return ei.sort_by('col').values.get_csc() + + return ei + + def _remove_edge_index(self, edge_attr:'torch_geometric.data.EdgeAttr')->bool: + del self._edge_indices[edge_attr.edge_type] + + # invalidate the graph + self.__graph = None + return True + + def get_all_edge_attrs(self) -> List['torch_geometric.data.EdgeAttr']: + attrs = [] + for et in self._edge_indices.keys(leaves_only=True, include_nested=True): + attrs.append( + torch_geometric.data.EdgeAttr( + edge_type=et, + layout='coo', + is_sorted=False, + size=self._sizes[et] + ) + ) + + return attrs \ No newline at end of file From ae82a771dab03f0b5e7df4b94408334d233223a3 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 1 May 2024 10:36:41 -0700 Subject: [PATCH 33/80] pyg dist --- .../cugraph-pyg/cugraph_pyg/data/__init__.py | 11 +- .../{cugraph_store.py => dask_graph_store.py} | 2 +- .../cugraph_pyg/data/feature_store.py | 100 +++++++++ .../cugraph_pyg/data/graph_store.py | 138 +++++++++++-- .../examples/cugraph_dist_sampling_mg.py | 2 +- .../examples/cugraph_dist_sampling_sg.py | 4 +- .../cugraph_pyg/loader/__init__.py | 12 +- ...aph_node_loader.py => dask_node_loader.py} | 20 +- .../cugraph_pyg/loader/neighbor_loader.py | 195 ++++++++++++++++++ .../cugraph_pyg/loader/node_loader.py | 102 +++++++++ .../cugraph_pyg/sampler/__init__.py | 4 +- .../cugraph_pyg/sampler/sampler.py | 136 ++++++++++++ .../{cugraph_sampler.py => sampler_utils.py} | 0 python/cugraph/cugraph/gnn/__init__.py | 1 + .../cugraph/gnn/data_loading/__init__.py | 1 + .../cugraph/gnn/data_loading/dist_sampler.py | 160 +++++++++++++- 16 files changed, 847 insertions(+), 41 deletions(-) rename python/cugraph-pyg/cugraph_pyg/data/{cugraph_store.py => dask_graph_store.py} (99%) create mode 100644 python/cugraph-pyg/cugraph_pyg/data/feature_store.py rename python/cugraph-pyg/cugraph_pyg/loader/{cugraph_node_loader.py => dask_node_loader.py} (97%) create mode 100644 python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py create mode 100644 python/cugraph-pyg/cugraph_pyg/loader/node_loader.py create mode 100644 python/cugraph-pyg/cugraph_pyg/sampler/sampler.py rename python/cugraph-pyg/cugraph_pyg/sampler/{cugraph_sampler.py => sampler_utils.py} (100%) diff --git 
a/python/cugraph-pyg/cugraph_pyg/data/__init__.py b/python/cugraph-pyg/cugraph_pyg/data/__init__.py index 66a9843c047..73679ea056a 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/data/__init__.py @@ -11,4 +11,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph_pyg.data.cugraph_store import CuGraphStore +import warnings + +from cugraph_pyg.data.dask_graph_store import DaskGraphStore +from cugraph_pyg.data.graph_store import GraphStore + +def CuGraphStore(*args, **kwargs): + warnings.warn( + "CuGraphStore has been renamed to DaskGraphStore" + ) + return DaskGraphStore(*args, **kwargs) diff --git a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py similarity index 99% rename from python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py rename to python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py index 354eea8ee6b..40c979d5b0b 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py @@ -199,7 +199,7 @@ def cast(cls, *args, **kwargs): return cls(*args, **kwargs) -class CuGraphStore: +class DaskGraphStore: """ Duck-typed version of PyG's GraphStore and FeatureStore. """ diff --git a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py new file mode 100644 index 00000000000..d939ef51dbd --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py @@ -0,0 +1,100 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +from typing import Optional, Tuple, List + +from cugraph.utilities.utils import import_optional, MissingModule + +torch = import_optional('torch') +torch_geometric = import_optional('torch_geometric') +tensordict = import_optional('tensordict') + +class TensorDictFeatureStore(object if isinstance(torch_geometric, MissingModule) else torch_geometric.data.FeatureStore): + """ + A basic implementation of the PyG FeatureStore interface that stores + feature data in a single TensorDict. This type of feature store is + not distributed, so each node will have to load the entire graph's + features into memory. 
+ """ + + def __init__(self): + super().__init__() + + self.__features = {} + + def _put_tensor(self, tensor:'torch_geometric.typing.FeatureTensorType', attr: 'torch_geometric.data.feature_store.TensorAttr') ->bool: + if attr.group_name in self.__features: + td = self.__features[attr.group_name] + batch_size = td.batch_size + + if attr.is_set('index'): + if attr.attr_name in td.keys(): + if attr.index.shape[0] != batch_size: + raise ValueError( + f"Leading size of index tensor does not match existing tensors for group name {attr.group_name};" + f" Expected {batch_size}, got {attr.index.shape[0]}" + ) + td[attr.attr_name][attr.index] = tensor + return True + else: + warnings.warn(f"Ignoring index parameter (attribute does not exist for group {attr.group_name})") + + if tensor.shape[0] != batch_size: + raise ValueError( + f"Leading size of input tensor does not match existing tensors for group name {attr.group_name};" + f" Expected {batch_size}, got {tensor.shape[0]}" + ) + else: + batch_size = tensor.shape[0] + self.__features[attr.group_name] = tensordict.TensorDict({}, batch_size=batch_size) + + self.__features[attr.group_name][attr.attr_name] = tensor + return True + + def _get_tensor(self, attr: 'torch_geometric.data.feature_store.TensorAttr') -> Optional['torch_geometric.typing.FeatureTensorType']: + if attr.group_name not in self.__features: + return None + + if attr.attr_name not in self.__features[attr.group_name]: + return None + + tensor = self.__features[attr.group_name][attr.attr_name] + return tensor if (attr.index is None or (not attr.is_set('index'))) else tensor[attr.index] + + def _remove_tensor(self, attr: 'torch_geometric.data.feature_store.TensorAttr') -> bool: + if attr.group_name not in self.__features: + return False + + if attr.attr_name not in self.__features[attr.group_name]: + return False + + del self.__features[attr.group_name][attr.attr_name] + return True + + def _get_tensor_size(self, attr: 'torch_geometric.data.feature_store.TensorAttr') -> Tuple: + return self._get_tensor(attr).size() + + def get_all_tensor_attr(self) -> List['torch_geometric.data.feature_store.TensorAttr']: + attrs = [] + for group_name, td in self.__features.items(): + for attr_name in td.keys(): + attrs.append( + torch_geometric.data.feature_store.TensorAttr( + group_name, + attr_name, + ) + ) + + return attrs diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py index f1292d9b4b7..77ecce6187a 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py @@ -1,32 +1,50 @@ -from cugraph.utilities.utils import import_optional, MissingModule - -from typing import Union, Optional +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import numpy as np import cupy import cudf import pandas +import pylibcugraph + +from cugraph.utilities.utils import import_optional, MissingModule +from cugraph.gnn.comms import cugraph_comms_get_raft_handle + +from typing import Union, Optional, List + + # Have to use import_optional even though these are required # dependencies in order to build properly. torch_geometric = import_optional('torch_geometric') torch = import_optional('torch') tensordict = import_optional('tensordict') -GraphStore = object if isinstance(torch_geometric, MissingModule) else torch_geometric.GraphStore TensorType = Union['torch.Tensor', cupy.ndarray, np.ndarray, cudf.Series, pandas.Series] -class DistGraphStore(GraphStore): +class GraphStore(object if isinstance(torch_geometric, MissingModule) else torch_geometric.data.GraphStore): """ This object uses lazy graph creation. Users can repeatedly call put_edge_index, and the tensors won't be converted into a cuGraph graph until one is needed (i.e. when creating a loader). """ - def __init__(self, ): - self._edge_indices = tensordict.TensorDict({}, batch_size=(2,)) - self._sizes = {} + def __init__(self, is_multi_gpu:bool=False): + self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,)) + self.__sizes = {} self.__graph = None + self.__handle = None + self.__is_multi_gpu = is_multi_gpu def _put_edge_index(self, edge_index:'torch_geometric.typing.EdgeTensorType', edge_attr:'torch_geometric.data.EdgeAttr') ->bool: if edge_attr.layout != 'coo': @@ -41,8 +59,8 @@ def _put_edge_index(self, edge_index:'torch_geometric.typing.EdgeTensorType', ed elif isinstance(edge_index, cudf.Series): edge_index = torch.as_tensor(edge_index.values, device='cuda') - self._edge_indices[edge_attr.edge_type] = torch.stack(edge_index) - self._sizes[edge_attr.edge_type] = edge_attr.size + self.__edge_indices[edge_attr.edge_type] = torch.stack(edge_index) + self.__sizes[edge_attr.edge_type] = edge_attr.size # invalidate the graph self.__graph = None @@ -50,7 +68,7 @@ def _put_edge_index(self, edge_index:'torch_geometric.typing.EdgeTensorType', ed def _get_edge_index(self, edge_attr:'torch_geometric.data.EdgeAttr')->Optional['torch_geometric.typing.EdgeTensorType']: ei = torch_geometric.EdgeIndex( - self._edge_indices[edge_attr.edge_type] + self.__edge_indices[edge_attr.edge_type] ) @@ -62,7 +80,7 @@ def _get_edge_index(self, edge_attr:'torch_geometric.data.EdgeAttr')->Optional[' return ei def _remove_edge_index(self, edge_attr:'torch_geometric.data.EdgeAttr')->bool: - del self._edge_indices[edge_attr.edge_type] + del self.__edge_indices[edge_attr.edge_type] # invalidate the graph self.__graph = None @@ -70,14 +88,104 @@ def _remove_edge_index(self, edge_attr:'torch_geometric.data.EdgeAttr')->bool: def get_all_edge_attrs(self) -> List['torch_geometric.data.EdgeAttr']: attrs = [] - for et in self._edge_indices.keys(leaves_only=True, include_nested=True): + for et in self.__edge_indices.keys(leaves_only=True, include_nested=True): attrs.append( torch_geometric.data.EdgeAttr( edge_type=et, layout='coo', is_sorted=False, - size=self._sizes[et] + size=self.__sizes[et] ) ) - return attrs \ No newline at end of file + return attrs + + @property + def is_multi_gpu(self): + return self.__is_multi_gpu + + @property + def _resource_handle(self): + if self.__handle is None: + if self.is_multi_gpu: + self.__handle = pylibcugraph.ResourceHandle( + cugraph_comms_get_raft_handle().getHandle() + ) + else: + self.__handle = pylibcugraph.ResourceHandle() + return self.__handle + + @property + def 
_graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: + graph_properties = pylibcugraph.GraphProperties( + is_multigraph=True, + is_symmetric=False + ) + + if self.__graph is None: + edgelist_dict = self.__get_edgelist() + if self.is_multi_gpu: + self.__graph = pylibcugraph.MGGraph( + self._handle, + graph_properties, + [edgelist_dict['src']], + [edgelist_dict['dst']], + edge_id_array=edgelist_dict['eid'], + edge_type_array=edgelist_dict['etp'], + ) + else: + self.__graph = pylibcugraph.SGGraph( + self._handle, + graph_properties, + edgelist_dict['src'], + edgelist_dict['dst'], + edge_id_array=edgelist_dict['eid'], + edge_type_array=edgelist_dict['etp'], + ) + + return self.__graph + + def __get_edgelist(self): + """ + Returns + ------- + Dict[str, torch.Tensor] with the following keys: + src: source vertices (int64) + Note that src is the 2nd element of the PyG edge index. + dst: destination vertices (int64) + Note that dst is the 1st element of the PyG edge index. + eid: edge ids for each edge (int64) + Note that these start from 0 for each edge type. + etp: edge types for each edge (int32) + Note that these are in lexicographic order. + """ + sorted_keys = sorted(list(self.__edge_indices.keys(leaves_only=True,include_nested=True))) + + # note that this still follows the PyG convention of (dst, rel, src) + # i.e. (author, writes, paper): [[0,1,2],[2,0,1]] is referring to a + # cuGraph graph where (paper 2) -> (author 0), (paper 0) -> (author 1), + # and (paper 1) -> (author 0) + edge_index = torch.concat([ + torch.stack([ + self.__edge_indices[dst_type,rel_type,src_type][0] + self.__get_vertex_offset(dst_type), + self.__edge_indices[dst_type,rel_type,src_type][1] + self.__get_vertex_offset(src_type), + ]) for (dst_type,rel_type,src_type) in sorted_keys + ], axis=1).cuda() + + edge_type_array = torch.arange(len(sorted_keys), dtype='int32').repeat_interleave(torch.tensor([ + self.__edge_indices[et].shape[1] for et in sorted_keys + ])).cuda() + + edge_id_array = torch.concat([ + torch.arange(self.__edge_indices[et].shape[1], dtype=torch.int64, device='cuda') + for et in sorted_keys + ]) + + return { + 'dst': edge_index[0], + 'src': edge_index[1], + 'etp': edge_type_array, + 'eid': edge_id_array, + } + + \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py index 29a6cc2b464..31cbaf69ca5 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py @@ -95,7 +95,7 @@ def main(): with tempfile.TemporaryDirectory() as directory: tmp.spawn( sample, - args=(world_size, uid, el, "."), + args=(world_size, uid, el, directory), nprocs=world_size, ) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py index 8366ff44233..e4e074ddc77 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py @@ -55,6 +55,8 @@ def sample(edgelist, directory): G, sample_writer, fanout=[5, 5], + compression="CSR", + retain_original_seeds=True, ) sampler.sample_from_nodes(seeds, batch_size=16, random_state=62) @@ -65,7 +67,7 @@ def main(): el = dataset[0][0]["edge_index"].astype("int64") with tempfile.TemporaryDirectory() as directory: - sample(el, directory) + sample(el, "/home/nfs/abarghi/deleteme/") 
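To make the (dst, rel, src) convention documented in __get_edgelist above concrete, here is a small hand-worked sketch; the vertex counts and edges are made up, and it assumes vertex types are offset into a single flat id space in lexicographic order (3 'author' vertices take flat ids 0-2, 3 'paper' vertices take ids 3-5):

import torch

# PyG stores the edge type ('author', 'writes', 'paper') as [[dst ids], [src ids]].
edge_index = torch.tensor([[0, 1, 2],
                           [2, 0, 1]])

author_offset, paper_offset = 0, 3            # lexicographic offsets over vertex types

dst = edge_index[0] + author_offset           # tensor([0, 1, 2]) -> flat author ids
src = edge_index[1] + paper_offset            # tensor([5, 3, 4]) -> flat paper ids

# The resulting cuGraph edgelist contains (paper 2) -> (author 0),
# (paper 0) -> (author 1) and (paper 1) -> (author 2), with per-type edge ids
# 0, 1, 2 and a single edge type code 0 for ('author', 'writes', 'paper').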
print("Printing samples...") for file in os.listdir(directory): diff --git a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py index 2c3d7eff89e..385155aa2dc 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py @@ -11,6 +11,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph_pyg.loader.cugraph_node_loader import CuGraphNeighborLoader +import warnings + +from cugraph_pyg.loader.node_loader import NodeLoader + +from cugraph_pyg.loader.cugraph_node_loader import DaskNeighborLoader from cugraph_pyg.loader.cugraph_node_loader import BulkSampleLoader + +def CuGraphNeighborLoader(*args, **kwargs): + warnings.warn( + "CuGraphNeighborLoader has been renamed to DaskNeighborLoader" + ) + return DaskNeighborLoader(*args, **kwargs) \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py similarity index 97% rename from python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py rename to python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py index 55c9e9b3329..20a081087cb 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py @@ -23,8 +23,8 @@ from cugraph.gnn import BulkSampler from cugraph.utilities.utils import import_optional, MissingModule -from cugraph_pyg.data import CuGraphStore -from cugraph_pyg.sampler.cugraph_sampler import ( +from cugraph_pyg.data import DaskGraphStore +from cugraph_pyg.sampler.sampler_utils import ( _sampler_output_from_sampling_results_heterogeneous, _sampler_output_from_sampling_results_homogeneous_csr, _sampler_output_from_sampling_results_homogeneous_coo, @@ -47,8 +47,8 @@ class BulkSampleLoader: def __init__( self, - feature_store: CuGraphStore, - graph_store: CuGraphStore, + feature_store: DaskGraphStore, + graph_store: DaskGraphStore, input_nodes: InputNodes = None, batch_size: int = 0, *, @@ -72,10 +72,10 @@ def __init__( Parameters ---------- - feature_store: CuGraphStore + feature_store: DaskGraphStore The feature store containing features for the graph. - graph_store: CuGraphStore + graph_store: DaskGraphStore The graph store containing the graph structure. input_nodes: InputNodes @@ -487,10 +487,10 @@ def __iter__(self): return self -class CuGraphNeighborLoader: +class DaskNeighborLoader: def __init__( self, - data: Union[CuGraphStore, Tuple[CuGraphStore, CuGraphStore]], + data: Union[CuGraphStore, Tuple[DaskGraphStore, DaskGraphStore]], input_nodes: Union[InputNodes, int] = None, batch_size: int = None, **kwargs, @@ -498,8 +498,8 @@ def __init__( """ Parameters ---------- - data: CuGraphStore or (CuGraphStore, CuGraphStore) - The CuGraphStore or stores where the graph/feature data is held. + data: DaskGraphStore or (DaskGraphStore, DaskGraphStore) + The DaskGraphStore or stores where the graph/feature data is held. batch_size: int (required) The number of input nodes in each batch. diff --git a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py new file mode 100644 index 00000000000..9ffa6438a67 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py @@ -0,0 +1,195 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +import tempfile + +from typing import Union, Tuple, Optional, Callable, List, Dict + +import cugraph_pyg +from cugraph_pyg.loader import NodeLoader +from cugraph_pyg.sampler import BaseSampler + +from cugraph.gnn import UniformNeighborSampler, DistSampleWriter + +torch_geometric = import_optional('torch_geometric') + +class NeighborLoader(NodeLoader): + """ + Node loader that implements the neighbor sampling + algorithm used in GraphSAGE. + + Duck-typed version of torch_geometric.loader.NeighborLoader + """ + def __init__(self, + data: Union['torch_geometric.data.Data', 'torch_geometric.data.HeteroData', Tuple['torch_geometric.data.FeatureStore', 'torch_geometric.data.GraphStore']], + num_neighbors: Union[List[int], Dict['torch_geometric.typing.EdgeType', List[int]]], + input_nodes: 'torch_geometric.typing.InputNodes' = None, + input_time: 'torch_geometric.typing.OptTensor' = None, + replace: bool = False, + subgraph_type: Union['torch_geometric.typing.SubgraphType', str] = 'directional', + disjoint: bool = False, + temporal_strategy: str = 'uniform', + time_attr: Optional[str] = None, + weight_attr: Optional[str] = None, + transform: Optional[Callable] = None, + transform_sampler_output: Optional[Callable] = None, + is_sorted: bool = False, + filter_per_worker: Optional[bool] = None, + neighbor_sampler: Optional['torch_geometric.sampler.NeighborSampler'] = None, + directed: bool = True, # Deprecated. + directory:str=None, + batches_per_partition=256, + format:str='parquet', + local_seeds_per_call: int=32768, + **kwargs,): + """ + data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] + See torch_geometric.loader.NeighborLoader. + num_neighbors: List[int] or Dict[EdgeType, List[int]] + Fanout values. + See torch_geometric.loader.NeighborLoader. + input_nodes: InputNodes + Input nodes for sampling. + See torch_geometric.loader.NeighborLoader. + input_time: OptTensor (optional) + See torch_geometric.loader.NeighborLoader. + replace: bool (optional, default=False) + Whether to sample with replacement. + See torch_geometric.loader.NeighborLoader. + subgraph_type: Union[SubgraphType, str] (optional, default='directional') + The type of subgraph to return. + Currently only 'directional' is supported. + See torch_geometric.loader.NeighborLoader. + disjoint: bool (optional, default=False) + Whether to perform disjoint sampling. + Currently unsupported. + See torch_geometric.loader.NeighborLoader. + temporal_strategy: str (optional, default='uniform') + Currently only 'uniform' is suppported. + See torch_geometric.loader.NeighborLoader. + time_attr: str (optional, default=None) + Used for temporal sampling. + See torch_geometric.loader.NeighborLoader. + weight_attr: str (optional, default=None) + Used for biased sampling. + See torch_geometric.loader.NeighborLoader. + transform: Callable (optional, default=None) + See torch_geometric.loader.NeighborLoader. 
+ transform_sampler_output: Callable (optional, default=None) + See torch_geometric.loader.NeighborLoader. + is_sorted: bool (optional, default=False) + Ignored by cuGraph. + See torch_geometric.loader.NeighborLoader. + filter_per_worker: bool (optional, default=False) + Currently ignored by cuGraph, but this may + change once in-memory sampling is implemented. + See torch_geometric.loader.NeighborLoader. + neighbor_sampler: torch_geometric.sampler.NeighborSampler (optional, default=None) + Not supported by cuGraph. + See torch_geometric.loader.NeighborLoader. + directed: bool (optional, default=True) + Deprecated. + See torch_geometric.loader.NeighborLoader. + directory: str (optional, default=None) + The directory where samples will be temporarily stored. + It is recommend that this be set by the user, usually + setting it to a tempfile.TemporaryDirectory with a context + manager is a good option but depending on the filesystem, + you may want to choose an alternative location with fast I/O + intead. + If not set, this will create a TemporaryDirectory that will + persist until this object is garbage collected. + See cugraph.gnn.DistSampleWriter. + batches_per_partition: int (optional, default=256) + The number of batches per partition if writing samples to + disk. Manually tuning this parameter is not recommended + but reducing it may help conserve GPU memory. + See cugraph.gnn.DistSampleWriter. + format: str (optional, default='parquet') + If writing samples to disk, they will be written in this + file format. + See cugraph.gnn.DistSampleWriter. + local_seeds_per_call: int (optional, default=32768) + The number of seeds to process within a single sampling call. + Manually tuning this parameter is not recommended but reducing + it may conserve GPU memory. The total number of seeds processed + per sampling call is equal to the sum of this parameter across + all workers. + See cugraph.gnn.DistSampler. + **kwargs + Other keyword arguments passed to the superclass. + """ + + if not directed: + subgraph_type = torch_geometric.sampler.base.SubgraphType.induced + warnings.warn( + "The 'directed' argument is deprecated. " + "Use subgraph_type='induced' instead." + ) + if subgraph_type != torch_geometric.sampler.base.SubgraphType.directional: + raise ValueError("Only directional subgraphs are currently supported") + if disjoint: + raise ValueError("Disjoint sampling is currently unsupported") + if temporal_strategy != 'uniform': + warnings.warn('Only the uniform temporal strategy is currently supported') + if neighbor_sampler is not None: + raise ValueError("Passing a neighbor sampler is currently unsupported") + if time_attr is not None: + raise ValueError("Temporal sampling is currently unsupported") + if weight_attr is not None: + raise ValueError("Biased sampling is currently unsupported") + if is_sorted: + warnings.warn("The 'is_sorted' argument is ignored by cuGraph.") + + if not isinstance(data, Tuple[cugraph_pyg.data.FeatureStore, cugraph_pyg.data.GraphStore]): + # Will eventually automatically convert these objects to cuGraph objects. 
+ raise NotImplementedError("Currently can't accept non-cugraph graphs") + + if directory is None: + warnings.warn("Setting a directory to store samples is recommended.") + self._tempdir = tempfile.TemporaryDirectory() + directory = self._tempdir.name + + writer = DistSampleWriter( + directory=directory, + batches_per_partition=batches_per_partition, + format=format + ) + + feature_store, graph_store = data + sampler = BaseSampler( + UniformNeighborSampler( + graph_store._graph, + writer, + retain_original_seeds=True, + fanout=num_neighbors, + prior_sources_behavior='exclude', + deduplicate_sources=True, + compression="CSR", + compress_per_hop=False, + with_replacement=replace, + local_seeds_per_call=local_seeds_per_call, + ) + ) + + super().__init__( + (feature_store, graph_store), + sampler, + input_nodes=input_nodes, + input_time=input_time, + transform=transform, + transform_sampler_output=transform_sampler_output, + filter_per_worker=filter_per_worker, + **kwargs, + ) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py new file mode 100644 index 00000000000..746d2c196f8 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py @@ -0,0 +1,102 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import cugraph_pyg +from typing import Union, Tuple, Callable, Optional + +from cugraph.utilities.utils import import_optional + +torch_geometric = import_optional('torch_geometric') + +class NodeLoader: + """ + Duck-typed version of torch_geometric.loader.NodeLoader + """ + + def __init__(self, + data: Union['torch_geometric.data.Data', 'torch_geometric.data.HeteroData', Tuple['torch_geometric.data.FeatureStore', 'torch_geometric.data.GraphStore']], + node_sampler: 'cugraph_pyg.sampler.BaseSampler', + input_nodes: 'torch_geometric.typing.InputNodes' = None, + input_time: 'torch_geometric.typing.OptTensor' = None, + transform: Optional[Callable] = None, + transform_sampler_output: Optional[Callable] = None, + filter_per_worker: Optional[bool] = None, + custom_cls: Optional['torch_geometric.data.HeteroData'] = None, + input_id: 'torch_geometric.typing.OptTensor' = None, + **kwargs,): + """ + Parameters + ---------- + data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] + See torch_geometric.loader.NodeLoader. + node_sampler: BaseSampler + See torch_geometric.loader.NodeLoader. + input_nodes: InputNodes + See torch_geometric.loader.NodeLoader. + input_time: OptTensor + See torch_geometric.loader.NodeLoader. + transform: Callable (optional, default=None) + This argument currently has no effect. + transform_sampler_output: Callable (optional, default=None) + This argument currently has no effect. + filter_per_worker: bool (optional, default=False) + This argument currently has no effect. + custom_cls: HeteroData + This argument currently has no effect. This loader will + always return a HeteroData object. 
+ input_id: OptTensor + See torch_geometric.loader.NodeLoader. + + """ + if not isinstance(data, Tuple[cugraph_pyg.data.FeatureStore, cugraph_pyg.data.GraphStore]): + # Will eventually automatically convert these objects to cuGraph objects. + raise NotImplementedError("Currently can't accept non-cugraph graphs") + + if not isinstance(node_sampler, cugraph_pyg.sampler.BaseSampler): + raise NotImplementedError("Must provide a cuGraph sampler") + + if input_time is not None: + raise ValueError("Temporal sampling is currently unsupported") + + if filter_per_worker: + warnings.warn("filter_per_worker is currently ignored") + + if custom_cls is not None: + warnings.warn("custom_cls is currently ignored") + + if transform is not None: + warnings.warn("transform is currently ignored.") + + if transform_sampler_output is not None: + warnings.warn("transform_sampler_output is currently ignored.") + + input_type, input_nodes, input_id = torch_geometric.loader.utils.get_input_nodes( + data, + input_nodes, + input_id, + ) + + self.__input_data = torch_geometric.loader.node_loader.NodeSamplerInput( + input_id=input_id, + node=input_nodes, + time=None, + input_type=input_type, + ) + + self.__node_sampler = node_sampler + + + def __iter__(self): + return self.__node_sampler.sample_from_nodes(self.__input_data) \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py b/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py index 2ec68a8b4ac..2299207e288 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,3 +10,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from cugraph_pyg.sampler.sampler import BaseSampler \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py new file mode 100644 index 00000000000..5a28bc722eb --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py @@ -0,0 +1,136 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
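A rough sketch of how the loader pieces above are meant to compose; the stores, seed tensor, output directory, and fanout below are placeholders, and NeighborLoader earlier in this patch wraps essentially the same wiring:

from cugraph.gnn import UniformNeighborSampler, DistSampleWriter
from cugraph_pyg.sampler import BaseSampler
from cugraph_pyg.loader import NodeLoader

# feature_store, graph_store and seeds are assumed to exist already.
writer = DistSampleWriter(directory='/tmp/samples', batches_per_partition=256)
sampler = BaseSampler(
    UniformNeighborSampler(graph_store._graph, writer, fanout=[10, 10])
)
loader = NodeLoader((feature_store, graph_store), sampler, input_nodes=seeds)

for out in loader:
    # at this point in the patch each iteration yields one decoded sampler output
    ...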
+ +from typing import Optional, Iterator, Union, Dict + +from cugraph.utilities.utils import import_optional +from cugraph.gnn import DistSampler, DistSamplerReader + +torch = import_optional("torch") +torch_geometric = import_optional('torch_geometric') + +class SampleReader: + def __init__(self, base_reader: DistSampleReader): + self.__base_reader = base_reader + self.__num_samples_remaining = 0 + self.__index = 0 + + def __next__(self): + if self.__num_samples_remaining == 0: + # raw_sample_data is already a dict of tensors + self.__raw_sample_data, start_inclusive, end_inclusive = next(self.__base_reader) + + self.__raw_sample_data['label_hop_offsets'] -= self.__raw_sample_data['label_hop_offsets'][0].clone() + self.__raw_sample_data['renumber_map_offsets'] -= self.__raw_sample_data['renumber_map_offsets'][0].clone() + if 'major_offsets' in self.__raw_sample_data: + self.__raw_sample_data['major_offsets'] -= self.__raw_sample_data['major_offsets'][0].clone() + + self.__num_samples_remaining = end_inclusive - start_inclusive + 1 + self.__index = 0 + + return self._decode(self.__raw_sample_data, self.__index) + + def __iter__(self): + return self + +class HomogeneousSampleReader(SampleReader): + def __init__(self, base_reader: DistSampleReader): + super().__init__(base_reader) + + def __decode_csc(self, raw_sample_data: Dict['torch.Tensor'], index: int): + fanout_length = len(raw_sample_data['label_hop_offsets']) - 1 // (len(raw_sample_data['renumber_map_offsets']) - 1) + + major_offsets_start_incl = raw_sample_data['label_hop_offsets'][index * fanout_length] + major_offsets_end_incl = raw_sample_data['label_hop_offsets'][(index + 1) * fanout_length] + + major_offsets = raw_sample_data['major_offsets'][major_offsets_start_incl : major_offsets_end_incl + 1].clone() + minors = raw_sample_data['minors'][major_offsets[0] : major_offsets[-1]] + edge_id = raw_sample_data['edge_id'][major_offsets[0] : major_offsets[-1]] + # don't retrieve edge type for a homogeneous graph + + renumber_map_start = raw_sample_data['renumber_map_offsets'][index] + renumber_map_end = raw_sample_data['renumber_map_offsets'][index + 1] + + renumber_map = raw_sample_data['map'][renumber_map_start:renumber_map_end] + + current_label_hop_offsets = raw_sample_data['label_hop_offsets'][index * fanout_length : (index + 1) * fanout_length + 1].clone() + current_label_hop_offsets -= current_label_hop_offsets[0].clone() + + num_sampled_edges = major_offsets[current_label_hop_offsets].diff() + num_sampled_nodes = torch.concat( + [ + current_label_hop_offsets.diff(), + (renumber_map.shape[0] - current_label_hop_offsets[-1]).reshape((1,)), + ] + ) + + return torch_geometric.sampler.SamplerOutput( + node=renumber_map, + row=minors, + col=major_offsets, + edge=edge_id, + batch=renumber_map[:num_sampled_nodes[0]], + num_sampled_nodes=num_sampled_nodes.cpu(), + num_sampled_edges=num_sampled_edges.cpu(), + ) + + def __decode_coo(raw_sample_data: Dict['torch.Tensor'], index: int): + fanout_length = len(raw_sample_data['label_hop_offsets']) - 1 // (len(raw_sample_data['renumber_map_offsets']) - 1) + + major_minor_start = raw_sample_data['label_hop_offsets'][index * fanout_length] + major_minor_end = raw_sample_data['label_hop_offsets'][(index + 1) * fanout_length] + + majors = raw_sample_data['majors'][major_minor_start:major_minor_end] + minors = raw_sample_data['minors'][major_minor_start:major_minor_end] + edge_id = raw_sample_data['edge_id'][major_minor_start:major_minor_end] + # don't retrieve edge type for a homogeneous graph + + 
renumber_map_start = raw_sample_data['renumber_map_offsets'][index] + renumber_map_end = raw_sample_data['renumber_map_offsets'][index + 1] + + renumber_map = raw_sample_data['map'][renumber_map_start:renumber_map_end] + + num_sampled_edges = raw_sample_data['label_hop_offsets'][index * fanout_length : (index + 1) * fanout_length + 1].diff().cpu() + + return torch_geometric.sampler.SamplerOutput( + node=renumber_map, + row=minors, + col=majors, + edge=edge_id, + batch=None, + num_sampled_nodes=None, + num_sampled_edges=num_sampled_edges, + ) + + def _decode(self, raw_sample_data: Dict['torch.Tensor'], index: int): + if 'major_offsets' in raw_sample_data: + return self.__decode_csc(raw_sample_data, index) + else: + return self.__decode_coo(raw_sample_data, index) + +class BaseSampler: + def __init__(self, sampler: DistSampler): + self.__sampler = sampler + + def sample_from_nodes(self, index: 'torch_geometric.sampler.NodeSamplerInput', **kwargs) -> Iterator[Union['torch_geometric.sampler.HeteroSamplerOutput', 'torch_geometric.sampler.SamplerOutput']]: + self.__sampler.sample_from_nodes( + index.node, + **kwargs + ) + + return SampleReader( + self.__sampler.get_reader() + ) + + def sample_from_edges(self, index: 'torch_geometric.sampler.EdgeSamplerInput', neg_sampling: Optional['torch_geometric.sampler.NegativeSampling'], **kwargs) -> Iterator[Union['torch_geometric.sampler.HeteroSamplerOutput', 'torch_geometric.sampler.SamplerOutput']]: + raise NotImplementedError("Edge sampling is currently unimplemented.") \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py similarity index 100% rename from python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py rename to python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py diff --git a/python/cugraph/cugraph/gnn/__init__.py b/python/cugraph/cugraph/gnn/__init__.py index 1f4d98f0230..b6c8e1981d0 100644 --- a/python/cugraph/cugraph/gnn/__init__.py +++ b/python/cugraph/cugraph/gnn/__init__.py @@ -16,6 +16,7 @@ from .data_loading.dist_sampler import ( DistSampler, DistSampleWriter, + DistSampleReader, UniformNeighborSampler, ) from .comms.cugraph_nccl_comms import ( diff --git a/python/cugraph/cugraph/gnn/data_loading/__init__.py b/python/cugraph/cugraph/gnn/data_loading/__init__.py index a50f6085e9a..98c547a0083 100644 --- a/python/cugraph/cugraph/gnn/data_loading/__init__.py +++ b/python/cugraph/cugraph/gnn/data_loading/__init__.py @@ -15,5 +15,6 @@ from cugraph.gnn.data_loading.dist_sampler import ( DistSampler, DistSampleWriter, + DistSampleReader, UniformNeighborSampler, ) diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py index e57e195a4b8..902eb73933e 100644 --- a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py @@ -12,6 +12,7 @@ # limitations under the License. 
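A small hand-worked sketch of the offset bookkeeping that HomogeneousSampleReader.__decode_csc above relies on. All tensor values are made up (2 hops, 2 batches per chunk), and the per-batch hop count is intended to be (len(label_hop_offsets) - 1) // (len(renumber_map_offsets) - 1):

import torch

label_hop_offsets    = torch.tensor([0, 2, 5, 7, 9])                  # 2 batches x 2 hops + 1
major_offsets        = torch.tensor([0, 2, 3, 5, 6, 7, 8, 9, 11, 12])
renumber_map_offsets = torch.tensor([0, 6, 11])                       # 6 and 5 renumbered nodes

fanout_length = (len(label_hop_offsets) - 1) // (len(renumber_map_offsets) - 1)   # 2 hops

index = 1                                                   # decode the second batch
start = label_hop_offsets[index * fanout_length]            # 5
end   = label_hop_offsets[(index + 1) * fanout_length]      # 9
batch_major_offsets = major_offsets[start : end + 1]        # tensor([7, 8, 9, 11, 12])

# minors/edge_id rows for this batch live in [batch_major_offsets[0], batch_major_offsets[-1])
print(batch_major_offsets[-1] - batch_major_offsets[0])     # tensor(5) -> 5 sampled edges

hop_offsets = label_hop_offsets[index * fanout_length : (index + 1) * fanout_length + 1].clone()
hop_offsets -= hop_offsets[0].clone()                        # tensor([0, 2, 4])
print(batch_major_offsets[hop_offsets].diff())               # tensor([2, 3]) edges per hop

n_nodes = renumber_map_offsets[index + 1] - renumber_map_offsets[index]   # 5 renumbered nodes
print(torch.concat([hop_offsets.diff(),
                    (n_nodes - hop_offsets[-1]).reshape((1,))]))           # tensor([2, 2, 1])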
import os +import re import warnings from math import ceil @@ -20,7 +21,8 @@ import cupy import cudf -from typing import Union, List, Dict, Tuple +from typing import Union, List, Dict, Tuple, Iterator + from cugraph.utilities import import_optional from cugraph.gnn.comms import cugraph_comms_get_raft_handle @@ -32,6 +34,43 @@ TensorType = Union["torch.Tensor", cupy.ndarray, cudf.Series] +class DistSampleReader: + def __init__(self, directory:str, *, format: str = "parquet", rank:int = 0): + self.__format = format + self.__directory = directory + + if format != "parquet": + raise ValueError("Invalid format (currently supported: 'parquet')") + + files = os.listdir(directory) + ex = re.compile(r'batch\=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet') + filematch = [ex.match(f) for f in files] + filematch = [f for f in filematch if f] + filematch = [f for f in filematch if int(f[1]) == rank] + filematch = sorted(filematch, key=lambda f: int(f[2]), reverse=True) + + self.__files = filematch + + def __iter__(self): + return self + + def __next__(self): + if len(self.__files) > 0: + f = self.__files.pop() + fname = f[0] + start_inclusive = int(f[2]) + end_inclusive = int(f[4]) + + df = cudf.read_parquet(os.path.join(self.__directory, fname)) + tensors = {} + for col in list(df.columns): + tensors[col] = torch.as_tensor(df[col].dropna(), device='cuda') + df.drop(col, axis=1, inplace=True) + + return tensors, start_inclusive, end_inclusive + + raise StopIteration + class DistSampleWriter: def __init__( self, @@ -71,6 +110,14 @@ def _directory(self): @property def _batches_per_partition(self): return self.__batches_per_partition + + def get_reader(self, rank: int) -> Iterator[Tuple[Dict['torch.Tensor'], int, int]]: + """ + Returns an iterator over sampled data. + """ + + # currently only disk reading is supported + return DistSampleReader(self._directory, format=self._format, rank=rank) def __write_minibatches_coo(self, minibatch_dict): has_edge_ids = minibatch_dict["edge_id"] is not None @@ -166,9 +213,102 @@ def __write_minibatches_coo(self, minibatch_dict): ) def __write_minibatches_csr(self, minibatch_dict): - raise NotImplementedError( - "CSR format currently not supported for distributed sampling" + has_edge_ids = minibatch_dict["edge_id"] is not None + has_edge_types = minibatch_dict["edge_type"] is not None + has_weights = minibatch_dict["weight"] is not None + + if minibatch_dict["renumber_map"] is None: + raise ValueError( + "Distributed sampling without renumbering is not supported" + ) + + # Quit if there are no batches to write. 
+ if len(minibatch_dict["batch_id"]) == 0: + return + + fanout_length = (len(minibatch_dict["label_hop_offsets"]) - 1) // len( + minibatch_dict["batch_id"] ) + rank_batch_offset = minibatch_dict["batch_id"][0] + + for p in range( + 0, int(ceil(len(minibatch_dict["batch_id"]) / self.__batches_per_partition)) + ): + partition_start = p * (self.__batches_per_partition) + partition_end = (p + 1) * (self.__batches_per_partition) + + label_hop_offsets_array_p = minibatch_dict["label_hop_offsets"][ + partition_start * fanout_length : partition_end * fanout_length + 1 + ] + + batch_id_array_p = minibatch_dict["batch_id"][partition_start:partition_end] + start_batch_id = batch_id_array_p[0] - rank_batch_offset + + # major offsets and minors + major_offsets_start_incl, major_offsets_end_incl = label_hop_offsets_array_p[[0, -1]] + + start_ix,end_ix = minibatch_dict['major_offsets'][[major_offsets_start_incl, major_offsets_end_incl]] + + major_offsets_array_p = minibatch_dict["major_offsets"][major_offsets_start_incl : major_offsets_end_incl + 1] + + minors_array_p = minibatch_dict["minors"][start_ix:end_ix] + edge_id_array_p = ( + minibatch_dict["edge_id"][start_ix:end_ix] + if has_edge_ids + else cupy.array([], dtype="int64") + ) + edge_type_array_p = ( + minibatch_dict["edge_type"][start_ix:end_ix] + if has_edge_types + else cupy.array([], dtype="int32") + ) + weight_array_p = ( + minibatch_dict["weight"][start_ix:end_ix] + if has_weights + else cupy.array([], dtype="float32") + ) + + # create the renumber map offsets + renumber_map_offsets_array_p = minibatch_dict["renumber_map_offsets"][ + partition_start : partition_end + 1 + ] + + renumber_map_start_ix, renumber_map_end_ix = renumber_map_offsets_array_p[ + [0, -1] + ] + + renumber_map_array_p = minibatch_dict["renumber_map"][ + renumber_map_start_ix:renumber_map_end_ix + ] + + results_dataframe_p = create_df_from_disjoint_arrays( + { + "major_offsets": major_offsets_array_p, + "minors": minors_array_p, + "map": renumber_map_array_p, + "label_hop_offsets": label_hop_offsets_array_p, + "weight": weight_array_p, + "edge_id": edge_id_array_p, + "edge_type": edge_type_array_p, + "renumber_map_offsets": renumber_map_offsets_array_p, + } + ) + + end_batch_id = start_batch_id + len(batch_id_array_p) - 1 + rank = minibatch_dict["rank"] if "rank" in minibatch_dict else 0 + + full_output_path = os.path.join( + self.__directory, + f"batch={rank:05d}.{start_batch_id:08d}-" + f"{rank:05d}.{end_batch_id:08d}.parquet", + ) + + results_dataframe_p.to_parquet( + full_output_path, + compression=None, + index=False, + force_nullable_schema=True, + ) def write_minibatches(self, minibatch_dict): if (minibatch_dict["majors"] is not None) and ( @@ -219,6 +359,13 @@ def __init__( self.__handle = None self.__retain_original_seeds = retain_original_seeds + def get_reader(self) -> Iterator[Tuple[Dict['torch.Tensor'], int, int]]: + """ + Returns an iterator over sampled data. + """ + rank = torch.distributed.get_rank() if self.is_multi_gpu else 0 + return self.__writer.get_reader(rank) + def sample_batches( self, seeds: TensorType, @@ -438,13 +585,6 @@ def sample_from_nodes( : len(current_seeds) ] - # Handle the case where not all ranks have the same number of call groups, - # in which case there will be some empty groups that get submitted on the - # ranks with fewer call groups. 
- label_start, label_end = ( - current_batches[[0, -1]] if len(current_batches) > 0 else (0, -1) - ) - minibatch_dict = self.sample_batches( seeds=current_seeds, batch_ids=current_batches, From 6a7be0648b4552735a90aebe0ca54a636463ed6b Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 1 May 2024 13:23:07 -0700 Subject: [PATCH 34/80] iterating over samples --- .../cugraph_pyg/loader/node_loader.py | 6 +- .../cugraph_pyg/sampler/__init__.py | 2 +- .../cugraph_pyg/sampler/sampler.py | 86 +++++++++++++++++-- .../cugraph_pyg/sampler/sampler_utils.py | 37 -------- 4 files changed, 85 insertions(+), 46 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py index 746d2c196f8..811330d4dae 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py @@ -55,7 +55,7 @@ def __init__(self, This argument currently has no effect. custom_cls: HeteroData This argument currently has no effect. This loader will - always return a HeteroData object. + always return a Data or HeteroData object. input_id: OptTensor See torch_geometric.loader.NodeLoader. @@ -99,4 +99,6 @@ def __init__(self, def __iter__(self): - return self.__node_sampler.sample_from_nodes(self.__input_data) \ No newline at end of file + return cugraph_pyg.sampling.SampleIterator( + self.__node_sampler.sample_from_nodes(self.__input_data) + ) \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py b/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py index 2299207e288..13322c72e83 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py @@ -11,4 +11,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph_pyg.sampler.sampler import BaseSampler \ No newline at end of file +from cugraph_pyg.sampler.sampler import BaseSampler, SampleIterator \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py index 5a28bc722eb..f0ff56d0f89 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py @@ -11,14 +11,80 @@ # See the License for the specific language governing permissions and # limitations under the License. 
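For reference, a small sketch of the partition-file naming scheme that DistSampleWriter produces and DistSampleReader parses above; the rank and batch ids are made up:

import re

ex = re.compile(r'batch\=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet')

rank, start_batch_id, end_batch_id = 0, 256, 511
fname = (f"batch={rank:05d}.{start_batch_id:08d}-"
         f"{rank:05d}.{end_batch_id:08d}.parquet")
print(fname)                                 # batch=00000.00000256-00000.00000511.parquet

m = ex.match(fname)
print(int(m[1]), int(m[2]), int(m[4]))       # 0 256 511 -> rank, first and last batch id

# The reader keeps only files whose rank field matches its own rank, sorts them by the
# first batch id in reverse, and pops from the end so partitions come back in order.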
-from typing import Optional, Iterator, Union, Dict +from typing import Optional, Iterator, Union, Dict, Tuple from cugraph.utilities.utils import import_optional -from cugraph.gnn import DistSampler, DistSamplerReader +from cugraph.gnn import DistSampler, DistSampleReader torch = import_optional("torch") torch_geometric = import_optional('torch_geometric') +class SampleIterator: + def __init__(self, data: Tuple['torch_geometric.data.FeatureStore', 'torch_geometric.data.GraphStore'], output_iter:Iterator[Union['torch_geometric.sampler.HeteroSamplerOutput', 'torch_geometric.sampler.SamplerOutput']]): + self.__feature_store, self.__graph_store = data + self.__output_iter = output_iter + + def __next__(self): + next_sample = next(self.__output_iter) + if isinstance(next_sample, 'torch_geometric.sampler.SamplerOutput'): + sz = next_sample.edge.numel() + if sz == next_sample.col.numel(): + col = next_sample.col + else: + col = torch_geometric.edge_index.ptr2index(next_sample.col, next_sample.edge.numel()) + + data = torch_geometric.data.utils.filter_custom_store( + self.__feature_store, + self.__graph_store, + next_sample.node, + next_sample.row, + col, + next_sample.edge, + None, + ) + + if 'n_id' not in data: + data.n_id = next_sample.node + if next_sample.edge is not None and 'e_id' not in data: + edge = next_sample.edge.to(torch.long) + perm = self.node_sampler.edge_permutation + data.e_id = perm[edge] if perm is not None else edge + + data.batch = next_sample.batch + data.num_sampled_nodes = next_sample.num_sampled_nodes + data.num_sampled_edges = next_sample.num_sampled_edges + + data.input_id = next_sample.metadata[0] + data.seed_time = next_sample.metadata[1] + data.batch_size = next_sample.metadata[0].size(0) + + elif isinstance(next_sample, 'torch_geometric.sampler.HeteroSamplerOutput'): + col = {} + for edge_type, col_idx in next_sample.col: + sz = next_sample.edge[edge_type].numel() + if sz == col_idx.numel(): + col[edge_type] = col_idx + else: + col[edge_type] = torch_geometric.edge_index.ptr2index(col_idx, sz) + + data = torch_geometric.data.utils.filter_custom_hetero_store( + self.__feature_store, + self.__graph_store, + next_sample.node, + next_sample.row, + col, + next_sample.edge, + None, + ) + else: + raise ValueError("Invalid output type") + + + def __iter__(self): + return self + + + class SampleReader: def __init__(self, base_reader: DistSampleReader): self.__base_reader = base_reader @@ -119,8 +185,9 @@ def _decode(self, raw_sample_data: Dict['torch.Tensor'], index: int): return self.__decode_coo(raw_sample_data, index) class BaseSampler: - def __init__(self, sampler: DistSampler): + def __init__(self, sampler: DistSampler, data: Tuple['torch_geometric.data.FeatureStore', 'torch_geometric.data.GraphStore']): self.__sampler = sampler + self.__feature_store, self.__graph_store = data def sample_from_nodes(self, index: 'torch_geometric.sampler.NodeSamplerInput', **kwargs) -> Iterator[Union['torch_geometric.sampler.HeteroSamplerOutput', 'torch_geometric.sampler.SamplerOutput']]: self.__sampler.sample_from_nodes( @@ -128,9 +195,16 @@ def sample_from_nodes(self, index: 'torch_geometric.sampler.NodeSamplerInput', * **kwargs ) - return SampleReader( - self.__sampler.get_reader() - ) + edge_attrs = self.__graph_store.get_all_edge_attrs() + if len(edge_attrs) == 1 and edge_attrs[0].edge_type[0] == edge_attrs[0].edge_type[2]: + return HomogeneousSampleReader( + self.__sampler.get_reader() + ) + else: + # TODO implement heterogeneous sampling + raise NotImplementedError( + "Sampling 
heterogeneous graphs is currently unsupported in the non-dask API" + ) def sample_from_edges(self, index: 'torch_geometric.sampler.EdgeSamplerInput', neg_sampling: Optional['torch_geometric.sampler.NegativeSampling'], **kwargs) -> Iterator[Union['torch_geometric.sampler.HeteroSamplerOutput', 'torch_geometric.sampler.SamplerOutput']]: raise NotImplementedError("Edge sampling is currently unimplemented.") \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py index ffab54efe08..599dea262db 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py @@ -402,40 +402,3 @@ def _sampler_output_from_sampling_results_heterogeneous( metadata=metadata, ) - -def filter_cugraph_store_csc( - feature_store: torch_geometric.data.FeatureStore, - graph_store: torch_geometric.data.GraphStore, - node_dict: Dict[str, torch.Tensor], - row_dict: Dict[str, torch.Tensor], - col_dict: Dict[str, torch.Tensor], - edge_dict: Dict[str, Tuple[torch.Tensor]], -) -> torch_geometric.data.HeteroData: - """ - Deprecated - """ - - data = torch_geometric.data.HeteroData() - - for attr in graph_store.get_all_edge_attrs(): - key = attr.edge_type - if key in row_dict and key in col_dict: - data.put_edge_index( - (row_dict[key], col_dict[key]), - edge_type=key, - layout="csc", - is_sorted=True, - ) - - required_attrs = [] - for attr in feature_store.get_all_tensor_attrs(): - if attr.group_name in node_dict: - attr.index = node_dict[attr.group_name] - required_attrs.append(attr) - data[attr.group_name].num_nodes = attr.index.size(0) - - tensors = feature_store.multi_get_tensor(required_attrs) - for i, attr in enumerate(required_attrs): - data[attr.group_name][attr.attr_name] = tensors[i] - - return data From fe4d8b4a59edc5c225491bdaf72e6fc6826aa11b Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 1 May 2024 14:17:11 -0700 Subject: [PATCH 35/80] initial testing --- .../cugraph-pyg/cugraph_pyg/data/__init__.py | 1 + .../cugraph_pyg/data/feature_store.py | 4 +- .../cugraph_pyg/data/graph_store.py | 10 +++- .../cugraph_pyg/examples/products_dist_sg.py | 46 +++++++++++++++++++ .../cugraph_pyg/loader/__init__.py | 5 +- .../cugraph_pyg/loader/dask_node_loader.py | 2 +- .../cugraph_pyg/loader/neighbor_loader.py | 1 + .../cugraph_pyg/sampler/sampler.py | 6 +-- .../cugraph/gnn/data_loading/dist_sampler.py | 4 +- 9 files changed, 67 insertions(+), 12 deletions(-) create mode 100644 python/cugraph-pyg/cugraph_pyg/examples/products_dist_sg.py diff --git a/python/cugraph-pyg/cugraph_pyg/data/__init__.py b/python/cugraph-pyg/cugraph_pyg/data/__init__.py index 73679ea056a..fd87e9da2c0 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/data/__init__.py @@ -15,6 +15,7 @@ from cugraph_pyg.data.dask_graph_store import DaskGraphStore from cugraph_pyg.data.graph_store import GraphStore +from cugraph_pyg.data.feature_store import TensorDictFeatureStore def CuGraphStore(*args, **kwargs): warnings.warn( diff --git a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py index d939ef51dbd..56bde698582 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py @@ -37,7 +37,7 @@ def __init__(self): def _put_tensor(self, tensor:'torch_geometric.typing.FeatureTensorType', attr: 
'torch_geometric.data.feature_store.TensorAttr') ->bool: if attr.group_name in self.__features: td = self.__features[attr.group_name] - batch_size = td.batch_size + batch_size = td.batch_size[0] if attr.is_set('index'): if attr.attr_name in td.keys(): @@ -67,7 +67,7 @@ def _get_tensor(self, attr: 'torch_geometric.data.feature_store.TensorAttr') -> if attr.group_name not in self.__features: return None - if attr.attr_name not in self.__features[attr.group_name]: + if attr.attr_name not in self.__features[attr.group_name].keys(): return None tensor = self.__features[attr.group_name][attr.attr_name] diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py index 77ecce6187a..cc54a3b3e2c 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py @@ -46,8 +46,10 @@ def __init__(self, is_multi_gpu:bool=False): self.__handle = None self.__is_multi_gpu = is_multi_gpu + super().__init__() + def _put_edge_index(self, edge_index:'torch_geometric.typing.EdgeTensorType', edge_attr:'torch_geometric.data.EdgeAttr') ->bool: - if edge_attr.layout != 'coo': + if edge_attr.layout != torch_geometric.data.graph_store.EdgeLayout.COO: raise ValueError("Only COO format supported") if isinstance(edge_index, (cupy.ndarray, cudf.Series)): @@ -59,7 +61,7 @@ def _put_edge_index(self, edge_index:'torch_geometric.typing.EdgeTensorType', ed elif isinstance(edge_index, cudf.Series): edge_index = torch.as_tensor(edge_index.values, device='cuda') - self.__edge_indices[edge_attr.edge_type] = torch.stack(edge_index) + self.__edge_indices[edge_attr.edge_type] = torch.stack([edge_index[0], edge_index[1]]) self.__sizes[edge_attr.edge_type] = edge_attr.size # invalidate the graph @@ -145,6 +147,10 @@ def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: return self.__graph + def __get_vertex_offset(vertex_type: str): + # write this + pass + def __get_edgelist(self): """ Returns diff --git a/python/cugraph-pyg/cugraph_pyg/examples/products_dist_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/products_dist_sg.py new file mode 100644 index 00000000000..24dc77cebc7 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/examples/products_dist_sg.py @@ -0,0 +1,46 @@ +import torch +import cupy + +import rmm +from rmm.allocators.cupy import rmm_cupy_allocator +from rmm.allocators.torch import rmm_torch_allocator + +# Must change allocators immediately upon import +# or else other imports will cause memory to be +# allocated and prevent changing the allocator +rmm.reinitialize(devices=[0], pool_allocator=True, managed_memory=True) +cupy.cuda.set_allocator(rmm_cupy_allocator) +torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + +import torch_geometric +import cugraph_pyg +from cugraph_pyg.loader import NeighborLoader + +# Enable cudf spilling to save gpu memory +from cugraph.testing.mg_utils import enable_spilling +enable_spilling() + +# Model parameters +HIDDEN_CHANNELS = 256 +NUM_LAYERS = 2 +LR = 0.001 +NUM_EPOCHS=4 +BATCH_SIZE=1024 +FANOUT = 30 + +device = torch.device('cuda') + +from ogb.nodeproppred import PygNodePropPredDataset +dataset = PygNodePropPredDataset(name='ogbn-products', + root='/datasets/ogb_datasets') # FIXME remove this +split_idx = dataset.get_idx_split() +data = dataset[0] + +graph_store = cugraph_pyg.data.GraphStore() +graph_store[('paper','cites','paper'), 'coo'] = data.edge_index + +feature_store = cugraph_pyg.data.TensorDictFeatureStore() +feature_store['paper', 'x'] 
= data.x +feature_store['paper', 'y'] = data.y + +print(graph_store._graph) \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py index 385155aa2dc..c17aa97b49a 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py @@ -14,10 +14,11 @@ import warnings from cugraph_pyg.loader.node_loader import NodeLoader +from cugraph_pyg.loader.neighbor_loader import NeighborLoader -from cugraph_pyg.loader.cugraph_node_loader import DaskNeighborLoader +from cugraph_pyg.loader.dask_node_loader import DaskNeighborLoader -from cugraph_pyg.loader.cugraph_node_loader import BulkSampleLoader +from cugraph_pyg.loader.dask_node_loader import BulkSampleLoader def CuGraphNeighborLoader(*args, **kwargs): warnings.warn( diff --git a/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py index 20a081087cb..aaf82dd46bb 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py @@ -490,7 +490,7 @@ def __iter__(self): class DaskNeighborLoader: def __init__( self, - data: Union[CuGraphStore, Tuple[DaskGraphStore, DaskGraphStore]], + data: Union[DaskGraphStore, Tuple[DaskGraphStore, DaskGraphStore]], input_nodes: Union[InputNodes, int] = None, batch_size: int = None, **kwargs, diff --git a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py index 9ffa6438a67..3223df31ddf 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py @@ -21,6 +21,7 @@ from cugraph_pyg.sampler import BaseSampler from cugraph.gnn import UniformNeighborSampler, DistSampleWriter +from cugraph.utilities.utils import import_optional torch_geometric = import_optional('torch_geometric') diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py index f0ff56d0f89..212910fa4fb 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py @@ -113,7 +113,7 @@ class HomogeneousSampleReader(SampleReader): def __init__(self, base_reader: DistSampleReader): super().__init__(base_reader) - def __decode_csc(self, raw_sample_data: Dict['torch.Tensor'], index: int): + def __decode_csc(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): fanout_length = len(raw_sample_data['label_hop_offsets']) - 1 // (len(raw_sample_data['renumber_map_offsets']) - 1) major_offsets_start_incl = raw_sample_data['label_hop_offsets'][index * fanout_length] @@ -150,7 +150,7 @@ def __decode_csc(self, raw_sample_data: Dict['torch.Tensor'], index: int): num_sampled_edges=num_sampled_edges.cpu(), ) - def __decode_coo(raw_sample_data: Dict['torch.Tensor'], index: int): + def __decode_coo(raw_sample_data: Dict[str, 'torch.Tensor'], index: int): fanout_length = len(raw_sample_data['label_hop_offsets']) - 1 // (len(raw_sample_data['renumber_map_offsets']) - 1) major_minor_start = raw_sample_data['label_hop_offsets'][index * fanout_length] @@ -178,7 +178,7 @@ def __decode_coo(raw_sample_data: Dict['torch.Tensor'], index: int): num_sampled_edges=num_sampled_edges, ) - def _decode(self, raw_sample_data: Dict['torch.Tensor'], index: int): + def _decode(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): if 'major_offsets' in raw_sample_data: 
return self.__decode_csc(raw_sample_data, index) else: diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py index 902eb73933e..1f864d02ae4 100644 --- a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py @@ -111,7 +111,7 @@ def _directory(self): def _batches_per_partition(self): return self.__batches_per_partition - def get_reader(self, rank: int) -> Iterator[Tuple[Dict['torch.Tensor'], int, int]]: + def get_reader(self, rank: int) -> Iterator[Tuple[Dict[str, 'torch.Tensor'], int, int]]: """ Returns an iterator over sampled data. """ @@ -359,7 +359,7 @@ def __init__( self.__handle = None self.__retain_original_seeds = retain_original_seeds - def get_reader(self) -> Iterator[Tuple[Dict['torch.Tensor'], int, int]]: + def get_reader(self) -> Iterator[Tuple[Dict[str, 'torch.Tensor'], int, int]]: """ Returns an iterator over sampled data. """ From 6050ad32c5ead6d8cd9e082abfdd934a703ded75 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 3 May 2024 10:41:00 -0700 Subject: [PATCH 36/80] dist sampling --- .../cugraph_pyg/data/feature_store.py | 2 +- .../cugraph_pyg/data/graph_store.py | 61 ++++--- .../cugraph_pyg/examples/products_dist_sg.py | 151 ++++++++++++++++-- .../cugraph_pyg/loader/neighbor_loader.py | 14 +- .../cugraph_pyg/loader/node_loader.py | 7 +- .../cugraph_pyg/sampler/sampler.py | 63 +++++--- .../cugraph/gnn/data_loading/dist_sampler.py | 33 +++- 7 files changed, 264 insertions(+), 67 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py index 56bde698582..7f6543580ff 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py @@ -86,7 +86,7 @@ def _remove_tensor(self, attr: 'torch_geometric.data.feature_store.TensorAttr') def _get_tensor_size(self, attr: 'torch_geometric.data.feature_store.TensorAttr') -> Tuple: return self._get_tensor(attr).size() - def get_all_tensor_attr(self) -> List['torch_geometric.data.feature_store.TensorAttr']: + def get_all_tensor_attrs(self) -> List['torch_geometric.data.feature_store.TensorAttr']: attrs = [] for group_name, td in self.__features.items(): for attr_name in td.keys(): diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py index cc54a3b3e2c..37f0a3f4424 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py @@ -21,7 +21,7 @@ from cugraph.utilities.utils import import_optional, MissingModule from cugraph.gnn.comms import cugraph_comms_get_raft_handle -from typing import Union, Optional, List +from typing import Union, Optional, List, Dict # Have to use import_optional even though these are required @@ -43,6 +43,7 @@ def __init__(self, is_multi_gpu:bool=False): self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,)) self.__sizes = {} self.__graph = None + self.__vertex_offsets = None self.__handle = None self.__is_multi_gpu = is_multi_gpu @@ -66,6 +67,7 @@ def _put_edge_index(self, edge_index:'torch_geometric.typing.EdgeTensorType', ed # invalidate the graph self.__graph = None + self.__vertex_offsets = None return True def _get_edge_index(self, edge_attr:'torch_geometric.data.EdgeAttr')->Optional['torch_geometric.typing.EdgeTensorType']: @@ -128,28 +130,51 @@ def _graph(self) -> Union[pylibcugraph.SGGraph, 
pylibcugraph.MGGraph]: edgelist_dict = self.__get_edgelist() if self.is_multi_gpu: self.__graph = pylibcugraph.MGGraph( - self._handle, + self._resource_handle, graph_properties, - [edgelist_dict['src']], - [edgelist_dict['dst']], - edge_id_array=edgelist_dict['eid'], - edge_type_array=edgelist_dict['etp'], + [cupy.asarray(edgelist_dict['src'])], + [cupy.asarray(edgelist_dict['dst'])], + edge_id_array=cupy.asarray(edgelist_dict['eid']), + edge_type_array=cupy.asarray(edgelist_dict['etp']), ) else: self.__graph = pylibcugraph.SGGraph( - self._handle, + self._resource_handle, graph_properties, - edgelist_dict['src'], - edgelist_dict['dst'], - edge_id_array=edgelist_dict['eid'], - edge_type_array=edgelist_dict['etp'], + cupy.asarray(edgelist_dict['src']), + cupy.asarray(edgelist_dict['dst']), + edge_id_array=cupy.asarray(edgelist_dict['eid']), + edge_type_array=cupy.asarray(edgelist_dict['etp']), ) return self.__graph - def __get_vertex_offset(vertex_type: str): - # write this - pass + @property + def _vertex_offsets(self) -> Dict[str, int]: + if self.__vertex_offsets is None: + num_vertices = {} + for edge_attr in self.get_all_edge_attrs(): + if edge_attr.size is not None: + num_vertices[edge_attr.edge_type[0]] = max(num_vertices[edge_attr.edge_type[0]], edge_attr.size[0]) if edge_attr.edge_type[0] in num_vertices else edge_attr.size[0] + num_vertices[edge_attr.edge_type[2]] = max(num_vertices[edge_attr.edge_type[2]], edge_attr.size[1]) if edge_attr.edge_type[2] in num_vertices else edge_attr.size[1] + else: + if edge_attr.edge_type[0] not in num_vertices: + num_vertices[edge_attr.edge_type[0]] = self.__edge_indices[edge_attr.edge_type][0].max() + 1 + if edge_attr.edge_type[2] not in num_vertices: + num_vertices[edge_attr.edge_type[1]] = self.__edge_indices[edge_attr.edge_type][1].max() + 1 + + ordered_keys = sorted(list(num_vertices.keys())) + self.__vertex_offsets = {} + offset = 0 + for vtype in ordered_keys: + self.__vertex_offsets[vtype] = offset + offset += num_vertices[vtype] + + return dict(self.__vertex_offsets) + + @property + def is_homogeneous(self) -> bool: + return len(self._vertex_offsets) == 1 def __get_edgelist(self): """ @@ -173,14 +198,14 @@ def __get_edgelist(self): # and (paper 1) -> (author 0) edge_index = torch.concat([ torch.stack([ - self.__edge_indices[dst_type,rel_type,src_type][0] + self.__get_vertex_offset(dst_type), - self.__edge_indices[dst_type,rel_type,src_type][1] + self.__get_vertex_offset(src_type), + self.__edge_indices[dst_type,rel_type,src_type][0] + self._vertex_offsets[dst_type], + self.__edge_indices[dst_type,rel_type,src_type][1] + self._vertex_offsets[src_type], ]) for (dst_type,rel_type,src_type) in sorted_keys ], axis=1).cuda() - edge_type_array = torch.arange(len(sorted_keys), dtype='int32').repeat_interleave(torch.tensor([ + edge_type_array = torch.arange(len(sorted_keys), dtype=torch.int32, device='cuda').repeat_interleave(torch.tensor([ self.__edge_indices[et].shape[1] for et in sorted_keys - ])).cuda() + ], device='cuda', dtype=torch.int32)) edge_id_array = torch.concat([ torch.arange(self.__edge_indices[et].shape[1], dtype=torch.int64, device='cuda') diff --git a/python/cugraph-pyg/cugraph_pyg/examples/products_dist_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/products_dist_sg.py index 24dc77cebc7..3bf45ba9da1 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/products_dist_sg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/products_dist_sg.py @@ -1,3 +1,10 @@ +import time +import argparse +import tempfile +import os + +from typing 
import Optional + import torch import cupy @@ -8,10 +15,11 @@ # Must change allocators immediately upon import # or else other imports will cause memory to be # allocated and prevent changing the allocator -rmm.reinitialize(devices=[0], pool_allocator=True, managed_memory=True) +rmm.reinitialize(devices=[0], pool_allocator=False, managed_memory=True) cupy.cuda.set_allocator(rmm_cupy_allocator) torch.cuda.memory.change_current_allocator(rmm_torch_allocator) +import torch.nn.functional as F import torch_geometric import cugraph_pyg from cugraph_pyg.loader import NeighborLoader @@ -20,27 +28,142 @@ from cugraph.testing.mg_utils import enable_spilling enable_spilling() -# Model parameters -HIDDEN_CHANNELS = 256 -NUM_LAYERS = 2 -LR = 0.001 -NUM_EPOCHS=4 -BATCH_SIZE=1024 -FANOUT = 30 +parser = argparse.ArgumentParser() +parser.add_argument('--hidden_channels', type=int, default=256) +parser.add_argument('--num_layers', type=int, default=2) +parser.add_argument('--lr', type=float, default=0.001) +parser.add_argument('--epochs', type=int, default=4) +parser.add_argument('--batch_size', type=int, default=1024) +parser.add_argument('--fan_out', type=int, default=30) +parser.add_argument('--tempdir_root', type=str, default=None) +parser.add_argument('--dataset_root', type=str, default='dataset') +parser.add_argument('--dataset', type=str, default='ogbn-products') + +args = parser.parse_args() +wall_clock_start = time.perf_counter() device = torch.device('cuda') from ogb.nodeproppred import PygNodePropPredDataset -dataset = PygNodePropPredDataset(name='ogbn-products', - root='/datasets/ogb_datasets') # FIXME remove this +dataset = PygNodePropPredDataset(name=args.dataset, + root=args.dataset_root) split_idx = dataset.get_idx_split() data = dataset[0] graph_store = cugraph_pyg.data.GraphStore() -graph_store[('paper','cites','paper'), 'coo'] = data.edge_index +graph_store[('node','rel','node'), 'coo', False, (data.num_nodes, data.num_nodes)] = data.edge_index feature_store = cugraph_pyg.data.TensorDictFeatureStore() -feature_store['paper', 'x'] = data.x -feature_store['paper', 'y'] = data.y +feature_store['node', 'x'] = data.x +feature_store['node', 'y'] = data.y + +with tempfile.TemporaryDirectory(dir=args.tempdir_root) as samples_dir: + train_dir = os.path.join(samples_dir, 'train') + os.mkdir(train_dir) + train_loader = NeighborLoader( + data=(feature_store, graph_store), + num_neighbors=[args.fan_out]*args.num_layers, + input_nodes=split_idx['train'], + replace=False, + batch_size=args.batch_size, + directory=train_dir, + ) + + val_dir = os.path.join(samples_dir, 'val') + os.mkdir(val_dir) + val_loader = NeighborLoader( + data=(feature_store, graph_store), + num_neighbors=[args.fan_out]*args.num_layers, + input_nodes=split_idx['valid'], + replace=False, + batch_size=args.batch_size, + directory=val_dir, + ) + + test_dir = os.path.join(samples_dir, 'test') + os.mkdir(test_dir) + test_loader = NeighborLoader( + data=(feature_store, graph_store), + num_neighbors=[args.fan_out]*args.num_layers, + input_nodes=split_idx['test'], + replace=False, + batch_size=args.batch_size, + directory=test_dir, + ) + + model = torch_geometric.nn.models.GCN( + dataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes, + ).to(device) + + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, + weight_decay=0.0005) + + warmup_steps = 20 + def train(epoch:int): + model.train() + for i, batch in enumerate(train_loader): + if i == warmup_steps: + torch.cuda.synchronize() + start_avg_time = 
time.perf_counter() + batch = batch.to(device) + + optimizer.zero_grad() + batch_size = batch.batch_size + out = model(batch.x, batch.edge_index)[:batch_size] + y = batch.y[:batch_size].view(-1).to(torch.long) + + print('y shape:', y.shape) + print('y:', y) + print('ymin:', y.min()) + print('ymax:', y.max()) + + print('batch:', batch) + print(batch.num_sampled_nodes) + print(batch.num_sampled_edges) + + loss = F.cross_entropy(out, y) + loss.backward() + optimizer.step() + + if i % 10 == 0: + print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}') + torch.cuda.synchronize() + print(f'Average Training Iteration Time (s/iter): \ + {(time.perf_counter() - start_avg_time)/(i-warmup_steps):.6f}') + + @torch.no_grad() + def test(loader: NeighborLoader, val_steps: Optional[int] = None): + model.eval() + + total_correct = total_examples = 0 + for i, batch in enumerate(loader): + if val_steps is not None and i >= val_steps: + break + batch = batch.to(device) + batch_size = batch.batch_size + out = model(batch.x, batch.edge_index)[:batch_size] + pred = out.argmax(dim=-1) + y = batch.y[:batch_size].view(-1).to(torch.long) + + total_correct += int((pred == y).sum()) + total_examples += y.size(0) + + return total_correct / total_examples + + torch.cuda.synchronize() + prep_time = round(time.perf_counter() - wall_clock_start, 2) + print("Total time before training begins (prep_time)=", prep_time, "seconds") + print("Beginning training...") + for epoch in range(1, 1 + args.epochs): + train(epoch) + val_acc = test(val_loader, val_steps=100) + print(f'Val Acc: ~{val_acc:.4f}') -print(graph_store._graph) \ No newline at end of file + test_acc = test(test_loader) + print(f'Test Acc: {test_acc:.4f}') + total_time = round(time.perf_counter() - wall_clock_start, 2) + print("Total Program Runtime (total_time) =", total_time, "seconds") + print("total_time - prep_time =", total_time - prep_time, "seconds") \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py index 3223df31ddf..d73c8363f22 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py @@ -49,6 +49,7 @@ def __init__(self, filter_per_worker: Optional[bool] = None, neighbor_sampler: Optional['torch_geometric.sampler.NeighborSampler'] = None, directed: bool = True, # Deprecated. + batch_size: int =16, directory:str=None, batches_per_partition=256, format:str='parquet', @@ -102,6 +103,9 @@ def __init__(self, directed: bool (optional, default=True) Deprecated. See torch_geometric.loader.NeighborLoader. + batch_size: int (optional, default=16) + The number of input nodes per output minibatch. + See torch.utils.dataloader. directory: str (optional, default=None) The directory where samples will be temporarily stored. It is recommend that this be set by the user, usually @@ -132,6 +136,8 @@ def __init__(self, Other keyword arguments passed to the superclass. 
""" + subgraph_type = torch_geometric.sampler.base.SubgraphType(subgraph_type) + if not directed: subgraph_type = torch_geometric.sampler.base.SubgraphType.induced warnings.warn( @@ -152,8 +158,7 @@ def __init__(self, raise ValueError("Biased sampling is currently unsupported") if is_sorted: warnings.warn("The 'is_sorted' argument is ignored by cuGraph.") - - if not isinstance(data, Tuple[cugraph_pyg.data.FeatureStore, cugraph_pyg.data.GraphStore]): + if not isinstance(data, (list, tuple)) or not isinstance(data[1], cugraph_pyg.data.GraphStore): # Will eventually automatically convert these objects to cuGraph objects. raise NotImplementedError("Currently can't accept non-cugraph graphs") @@ -181,8 +186,11 @@ def __init__(self, compress_per_hop=False, with_replacement=replace, local_seeds_per_call=local_seeds_per_call, - ) + ), + (feature_store, graph_store), + batch_size=batch_size ) + # TODO add heterogeneous support and pass graph_store._vertex_offsets super().__init__( (feature_store, graph_store), diff --git a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py index 811330d4dae..a8f102e984d 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py @@ -60,7 +60,7 @@ def __init__(self, See torch_geometric.loader.NodeLoader. """ - if not isinstance(data, Tuple[cugraph_pyg.data.FeatureStore, cugraph_pyg.data.GraphStore]): + if not isinstance(data, (list, tuple)) or not isinstance(data[1], cugraph_pyg.data.GraphStore): # Will eventually automatically convert these objects to cuGraph objects. raise NotImplementedError("Currently can't accept non-cugraph graphs") @@ -95,10 +95,13 @@ def __init__(self, input_type=input_type, ) + self.__data = data + self.__node_sampler = node_sampler def __iter__(self): - return cugraph_pyg.sampling.SampleIterator( + return cugraph_pyg.sampler.SampleIterator( + self.__data, self.__node_sampler.sample_from_nodes(self.__input_data) ) \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py index 212910fa4fb..84a5ada0c7d 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py @@ -26,14 +26,14 @@ def __init__(self, data: Tuple['torch_geometric.data.FeatureStore', 'torch_geome def __next__(self): next_sample = next(self.__output_iter) - if isinstance(next_sample, 'torch_geometric.sampler.SamplerOutput'): + if isinstance(next_sample, torch_geometric.sampler.SamplerOutput): sz = next_sample.edge.numel() if sz == next_sample.col.numel(): col = next_sample.col else: col = torch_geometric.edge_index.ptr2index(next_sample.col, next_sample.edge.numel()) - data = torch_geometric.data.utils.filter_custom_store( + data = torch_geometric.loader.utils.filter_custom_store( self.__feature_store, self.__graph_store, next_sample.node, @@ -47,18 +47,17 @@ def __next__(self): data.n_id = next_sample.node if next_sample.edge is not None and 'e_id' not in data: edge = next_sample.edge.to(torch.long) - perm = self.node_sampler.edge_permutation - data.e_id = perm[edge] if perm is not None else edge + data.e_id = edge data.batch = next_sample.batch data.num_sampled_nodes = next_sample.num_sampled_nodes data.num_sampled_edges = next_sample.num_sampled_edges - data.input_id = next_sample.metadata[0] - data.seed_time = next_sample.metadata[1] - data.batch_size = next_sample.metadata[0].size(0) + data.input_id = data.batch 
+ data.seed_time = None + data.batch_size = data.input_id.size(0) - elif isinstance(next_sample, 'torch_geometric.sampler.HeteroSamplerOutput'): + elif isinstance(next_sample, torch_geometric.sampler.HeteroSamplerOutput): col = {} for edge_type, col_idx in next_sample.col: sz = next_sample.edge[edge_type].numel() @@ -67,7 +66,7 @@ def __next__(self): else: col[edge_type] = torch_geometric.edge_index.ptr2index(col_idx, sz) - data = torch_geometric.data.utils.filter_custom_hetero_store( + data = torch_geometric.loader.utils.filter_custom_hetero_store( self.__feature_store, self.__graph_store, next_sample.node, @@ -76,9 +75,25 @@ def __next__(self): next_sample.edge, None, ) + + for key, node in next_sample.node.items(): + if 'n_id' not in data[key]: + data[key].n_id = node + + for key, edge in (next_sample.edge or {}).items(): + if edge is not None and 'e_id' not in data[key]: + edge = edge.to(torch.long) + data[key].e_id = edge + + data.set_value_dict('batch', next_sample.batch) + data.set_value_dict('num_sampled_nodes', next_sample.num_sampled_nodes) + data.set_value_dict('num_sampled_edges', next_sample.num_sampled_edges) + + # TODO figure out how to set input_id for heterogeneous output else: raise ValueError("Invalid output type") - + + return data def __iter__(self): return self @@ -104,7 +119,10 @@ def __next__(self): self.__num_samples_remaining = end_inclusive - start_inclusive + 1 self.__index = 0 - return self._decode(self.__raw_sample_data, self.__index) + out = self._decode(self.__raw_sample_data, self.__index) + self.__index += 1 + self.__num_samples_remaining -= 1 + return out def __iter__(self): return self @@ -114,7 +132,7 @@ def __init__(self, base_reader: DistSampleReader): super().__init__(base_reader) def __decode_csc(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): - fanout_length = len(raw_sample_data['label_hop_offsets']) - 1 // (len(raw_sample_data['renumber_map_offsets']) - 1) + fanout_length = (len(raw_sample_data['label_hop_offsets']) - 1) // (len(raw_sample_data['renumber_map_offsets']) - 1) major_offsets_start_incl = raw_sample_data['label_hop_offsets'][index * fanout_length] major_offsets_end_incl = raw_sample_data['label_hop_offsets'][(index + 1) * fanout_length] @@ -133,15 +151,16 @@ def __decode_csc(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): current_label_hop_offsets -= current_label_hop_offsets[0].clone() num_sampled_edges = major_offsets[current_label_hop_offsets].diff() - num_sampled_nodes = torch.concat( - [ - current_label_hop_offsets.diff(), - (renumber_map.shape[0] - current_label_hop_offsets[-1]).reshape((1,)), - ] - ) + + print('lho:', current_label_hop_offsets) + num_sampled_nodes = current_label_hop_offsets.diff() + num_sampled_nodes = torch.concat([ + num_sampled_nodes.clone(), + (renumber_map.shape[0] - num_sampled_nodes.sum()).reshape((1,)), + ]) return torch_geometric.sampler.SamplerOutput( - node=renumber_map, + node=renumber_map.cpu(), row=minors, col=major_offsets, edge=edge_id, @@ -169,7 +188,7 @@ def __decode_coo(raw_sample_data: Dict[str, 'torch.Tensor'], index: int): num_sampled_edges = raw_sample_data['label_hop_offsets'][index * fanout_length : (index + 1) * fanout_length + 1].diff().cpu() return torch_geometric.sampler.SamplerOutput( - node=renumber_map, + node=renumber_map.cpu(), row=minors, col=majors, edge=edge_id, @@ -185,13 +204,15 @@ def _decode(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): return self.__decode_coo(raw_sample_data, index) class BaseSampler: - def __init__(self, 
sampler: DistSampler, data: Tuple['torch_geometric.data.FeatureStore', 'torch_geometric.data.GraphStore']): + def __init__(self, sampler: DistSampler, data: Tuple['torch_geometric.data.FeatureStore', 'torch_geometric.data.GraphStore'], batch_size:int=16): self.__sampler = sampler self.__feature_store, self.__graph_store = data + self.__batch_size = batch_size def sample_from_nodes(self, index: 'torch_geometric.sampler.NodeSamplerInput', **kwargs) -> Iterator[Union['torch_geometric.sampler.HeteroSamplerOutput', 'torch_geometric.sampler.SamplerOutput']]: self.__sampler.sample_from_nodes( index.node, + batch_size=self.__batch_size, **kwargs ) diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py index 1f864d02ae4..e6dc342868b 100644 --- a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py @@ -64,7 +64,9 @@ def __next__(self): df = cudf.read_parquet(os.path.join(self.__directory, fname)) tensors = {} for col in list(df.columns): - tensors[col] = torch.as_tensor(df[col].dropna(), device='cuda') + s = df[col].dropna() + if len(s) > 0: + tensors[col] = torch.as_tensor(s, device='cuda') df.drop(col, axis=1, inplace=True) return tensors, start_inclusive, end_inclusive @@ -242,7 +244,7 @@ def __write_minibatches_csr(self, minibatch_dict): ] batch_id_array_p = minibatch_dict["batch_id"][partition_start:partition_end] - start_batch_id = batch_id_array_p[0] - rank_batch_offset + start_batch_id = batch_id_array_p[0] # major offsets and minors major_offsets_start_incl, major_offsets_end_incl = label_hop_offsets_array_p[[0, -1]] @@ -329,7 +331,7 @@ def __init__( graph: Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph], writer: DistSampleWriter, local_seeds_per_call: int = 32768, - retain_original_seeds: bool = False, # TODO See #4329, needs C API + retain_original_seeds: bool = False, ): """ Parameters @@ -666,12 +668,13 @@ def sample_batches( local_label_list, assume_equal_input_size=assume_equal_input_size ) - # TODO add calculation of seed vertex label offsets if self._retain_original_seeds: - warnings.warn( - "The 'retain_original_seeds` parameter is currently ignored " - "since seed retention is not implemented yet." 
- ) + label_offsets = torch.concat([ + torch.searchsorted(batch_ids, local_label_list), + torch.tensor([batch_ids.shape[0]], device='cuda', dtype=torch.int64) + ]) + else: + label_offsets = None sampling_results_dict = pylibcugraph.uniform_neighbor_sample( self._resource_handle, @@ -691,10 +694,22 @@ def sample_batches( renumber=True, compression=self.__compression, compress_per_hop=self.__compress_per_hop, + retain_seeds=self._retain_original_seeds, + label_offsets=cupy.asarray(label_offsets), return_dict=True, ) sampling_results_dict["rank"] = rank else: + if self._retain_original_seeds: + batch_ids = batch_ids.to(device="cuda", dtype=torch.int32) + local_label_list = torch.unique(batch_ids) + label_offsets = torch.concat([ + torch.searchsorted(batch_ids, local_label_list), + torch.tensor([batch_ids.shape[0]], device='cuda', dtype=torch.int64) + ]) + else: + label_offsets = None + sampling_results_dict = pylibcugraph.uniform_neighbor_sample( self._resource_handle, self._graph, @@ -711,6 +726,8 @@ def sample_batches( renumber=True, compression=self.__compression, compress_per_hop=self.__compress_per_hop, + retain_seeds = self._retain_original_seeds, + label_offsets = cupy.asarray(label_offsets), return_dict=True, ) From b64f60959d7afc4e11e71424b63b5481e09d3b6f Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Tue, 7 May 2024 16:31:22 -0400 Subject: [PATCH 37/80] Merge in changes from https://github.com/rapidsai/cugraph/pull/4312 --- ci/test_wheel_cugraph-pyg.sh | 2 +- conda/recipes/cugraph-pyg/meta.yaml | 2 +- .../cugraph_pyg/tests/mg/test_mg_cugraph_store.py | 7 ++++++- .../cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py | 8 ++++++-- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/ci/test_wheel_cugraph-pyg.sh b/ci/test_wheel_cugraph-pyg.sh index e98bf4ab56b..f45112dd80b 100755 --- a/ci/test_wheel_cugraph-pyg.sh +++ b/ci/test_wheel_cugraph-pyg.sh @@ -33,7 +33,7 @@ else fi rapids-logger "Installing PyTorch and PyG dependencies" rapids-retry python -m pip install torch==2.1.0 --index-url ${PYTORCH_URL} -rapids-retry python -m pip install torch-geometric==2.4.0 +rapids-retry python -m pip install "torch-geometric>=2.5,<2.6" rapids-retry python -m pip install \ ogb \ pyg_lib \ diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 818616c2f5b..c02e8391eb2 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -34,7 +34,7 @@ requirements: - cupy >=12.0.0 - cugraph ={{ version }} - pylibcugraphops ={{ minor_version }} - - pyg >=2.3,<2.5 + - pyg >=2.5,<2.6 tests: imports: diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py index 7047c62250b..85acbebc3ec 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py @@ -373,10 +373,15 @@ def test_get_input_nodes(karate_gnn, dask_client): F, G, N = karate_gnn cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) - node_type, input_nodes = torch_geometric.loader.utils.get_input_nodes( + nodes = torch_geometric.loader.utils.get_input_nodes( (cugraph_store, cugraph_store), "type0" ) + if len(nodes) == 2: + node_type, input_nodes = nodes + else: + node_type, input_nodes, _ = nodes + assert node_type == "type0" assert input_nodes.tolist() == torch.arange(17, dtype=torch.int32).tolist() diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py 
b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
index 9813fa933ee..61715999c2e 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
@@ -32,7 +32,11 @@
 torch = import_optional("torch")
 torch_geometric = import_optional("torch_geometric")
+
 trim_to_layer = import_optional("torch_geometric.utils.trim_to_layer")
+if isinstance(trim_to_layer, MissingModule):
+    trim_to_layer = import_optional("torch_geometric.utils._trim_to_layer")
+

 try:
     import torch_sparse  # noqa: F401
@@ -333,8 +337,8 @@ def test_cugraph_loader_e2e_coo():
     for hetero_data in loader:
         ei = hetero_data["t0", "knows", "t0"]["edge_index"]
         x = hetero_data["t0"]["x"].cuda()
-        num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"]
-        num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"]
+        num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"].tolist()
+        num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"].tolist()

         for i in range(len(convs)):
             x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None)

From 5c79b4fed92e634220d4da459ba8605ef38ddc10 Mon Sep 17 00:00:00 2001
From: Tingyu Wang
Date: Tue, 7 May 2024 16:33:29 -0400
Subject: [PATCH 38/80] style

---
 python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
index 61715999c2e..fe91d378d87 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at From 393734230bf23d12e09d99bc21c7afd58505cf29 Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Tue, 7 May 2024 17:12:50 -0400 Subject: [PATCH 39/80] style --- python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py index fe91d378d87..75549e9d313 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py @@ -338,7 +338,9 @@ def test_cugraph_loader_e2e_coo(): ei = hetero_data["t0", "knows", "t0"]["edge_index"] x = hetero_data["t0"]["x"].cuda() num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"].tolist() - num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"].tolist() + num_sampled_edges = hetero_data["t0", "knows", "t0"][ + "num_sampled_edges" + ].tolist() for i in range(len(convs)): x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None) From b807185d49636ecc623f5a90e911a23a8362e806 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 7 May 2024 15:54:47 -0700 Subject: [PATCH 40/80] changes to get sg example working, use CSC format --- .../cugraph_pyg/examples/gcn_dist_mg.py | 0 .../{products_dist_sg.py => gcn_dist_sg.py} | 11 +---- .../cugraph_pyg/loader/neighbor_loader.py | 18 +++++-- .../cugraph_pyg/sampler/sampler.py | 43 +++++++++++----- .../cugraph/gnn/data_loading/dist_sampler.py | 49 ++++++++++++++----- 5 files changed, 85 insertions(+), 36 deletions(-) create mode 100644 python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py rename python/cugraph-pyg/cugraph_pyg/examples/{products_dist_sg.py => gcn_dist_sg.py} (94%) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cugraph-pyg/cugraph_pyg/examples/products_dist_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py similarity index 94% rename from python/cugraph-pyg/cugraph_pyg/examples/products_dist_sg.py rename to python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py index 3bf45ba9da1..550b9a600a7 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/products_dist_sg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py @@ -15,7 +15,7 @@ # Must change allocators immediately upon import # or else other imports will cause memory to be # allocated and prevent changing the allocator -rmm.reinitialize(devices=[0], pool_allocator=False, managed_memory=True) +rmm.reinitialize(devices=[0], pool_allocator=True, managed_memory=True) cupy.cuda.set_allocator(rmm_cupy_allocator) torch.cuda.memory.change_current_allocator(rmm_torch_allocator) @@ -115,15 +115,6 @@ def train(epoch:int): out = model(batch.x, batch.edge_index)[:batch_size] y = batch.y[:batch_size].view(-1).to(torch.long) - print('y shape:', y.shape) - print('y:', y) - print('ymin:', y.min()) - print('ymax:', y.max()) - - print('batch:', batch) - print(batch.num_sampled_nodes) - print(batch.num_sampled_edges) - loss = F.cross_entropy(out, y) loss.backward() optimizer.step() diff --git a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py index d73c8363f22..47d8065abce 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py 
@@ -53,7 +53,8 @@ def __init__(self, directory:str=None, batches_per_partition=256, format:str='parquet', - local_seeds_per_call: int=32768, + compression:Optional[str]=None, + local_seeds_per_call: Optional[int]=None, **kwargs,): """ data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] @@ -125,12 +126,16 @@ def __init__(self, If writing samples to disk, they will be written in this file format. See cugraph.gnn.DistSampleWriter. - local_seeds_per_call: int (optional, default=32768) + compression: str (optional, default=None) + The compression type to use if writing samples to disk. + If not provided, it is automatically chosen. + local_seeds_per_call: int (optional, default=None) The number of seeds to process within a single sampling call. Manually tuning this parameter is not recommended but reducing it may conserve GPU memory. The total number of seeds processed per sampling call is equal to the sum of this parameter across - all workers. + all workers. If not provided, it will be automatically + calculated. See cugraph.gnn.DistSampler. **kwargs Other keyword arguments passed to the superclass. @@ -167,6 +172,11 @@ def __init__(self, self._tempdir = tempfile.TemporaryDirectory() directory = self._tempdir.name + if compression is None: + compression = "CSR" + elif compression not in ["CSR", "COO"]: + raise ValueError("Invalid value for compression (expected 'CSR' or 'COO')") + writer = DistSampleWriter( directory=directory, batches_per_partition=batches_per_partition, @@ -182,7 +192,7 @@ def __init__(self, fanout=num_neighbors, prior_sources_behavior='exclude', deduplicate_sources=True, - compression="CSR", + compression=compression, compress_per_hop=False, with_replacement=replace, local_seeds_per_call=local_seeds_per_call, diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py index 84a5ada0c7d..8422a38563c 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py @@ -132,7 +132,7 @@ def __init__(self, base_reader: DistSampleReader): super().__init__(base_reader) def __decode_csc(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): - fanout_length = (len(raw_sample_data['label_hop_offsets']) - 1) // (len(raw_sample_data['renumber_map_offsets']) - 1) + fanout_length = (raw_sample_data['label_hop_offsets'].numel() - 1) // (raw_sample_data['renumber_map_offsets'].numel() - 1) major_offsets_start_incl = raw_sample_data['label_hop_offsets'][index * fanout_length] major_offsets_end_incl = raw_sample_data['label_hop_offsets'][(index + 1) * fanout_length] @@ -142,6 +142,8 @@ def __decode_csc(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): edge_id = raw_sample_data['edge_id'][major_offsets[0] : major_offsets[-1]] # don't retrieve edge type for a homogeneous graph + major_offsets -= major_offsets[0].clone() + renumber_map_start = raw_sample_data['renumber_map_offsets'][index] renumber_map_end = raw_sample_data['renumber_map_offsets'][index + 1] @@ -152,11 +154,15 @@ def __decode_csc(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): num_sampled_edges = major_offsets[current_label_hop_offsets].diff() - print('lho:', current_label_hop_offsets) - num_sampled_nodes = current_label_hop_offsets.diff() + num_sampled_nodes_hops = torch.tensor([ + minors[:num_sampled_edges[:i].sum()].max() + 1 + for i in range(1, fanout_length + 1) + ], device='cpu') + + num_seeds = torch.searchsorted(major_offsets, num_sampled_edges[0]).reshape((1,)).cpu() 
num_sampled_nodes = torch.concat([ - num_sampled_nodes.clone(), - (renumber_map.shape[0] - num_sampled_nodes.sum()).reshape((1,)), + num_seeds, + num_sampled_nodes_hops.diff(prepend=num_seeds) ]) return torch_geometric.sampler.SamplerOutput( @@ -164,16 +170,20 @@ def __decode_csc(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): row=minors, col=major_offsets, edge=edge_id, - batch=renumber_map[:num_sampled_nodes[0]], + batch=renumber_map[:num_seeds], num_sampled_nodes=num_sampled_nodes.cpu(), num_sampled_edges=num_sampled_edges.cpu(), ) - def __decode_coo(raw_sample_data: Dict[str, 'torch.Tensor'], index: int): - fanout_length = len(raw_sample_data['label_hop_offsets']) - 1 // (len(raw_sample_data['renumber_map_offsets']) - 1) + def __decode_coo(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): + fanout_length = (raw_sample_data['label_hop_offsets'].numel() - 1) // (raw_sample_data['renumber_map_offsets'].numel() - 1) major_minor_start = raw_sample_data['label_hop_offsets'][index * fanout_length] - major_minor_end = raw_sample_data['label_hop_offsets'][(index + 1) * fanout_length] + ix_end = (index + 1) * fanout_length + if ix_end == raw_sample_data['label_hop_offsets'].numel(): + major_minor_end = raw_sample_data['majors'].numel() + else: + major_minor_end = raw_sample_data['label_hop_offsets'][ix_end] majors = raw_sample_data['majors'][major_minor_start:major_minor_end] minors = raw_sample_data['minors'][major_minor_start:major_minor_end] @@ -186,14 +196,25 @@ def __decode_coo(raw_sample_data: Dict[str, 'torch.Tensor'], index: int): renumber_map = raw_sample_data['map'][renumber_map_start:renumber_map_end] num_sampled_edges = raw_sample_data['label_hop_offsets'][index * fanout_length : (index + 1) * fanout_length + 1].diff().cpu() + + num_seeds = (majors[:num_sampled_edges[0]].max() + 1).reshape((1,)).cpu() + num_sampled_nodes_hops = torch.tensor([ + minors[:num_sampled_edges[:i].sum()].max() + 1 + for i in range(1, fanout_length + 1) + ], device='cpu') + + num_sampled_nodes = torch.concat([ + num_seeds, + num_sampled_nodes_hops.diff(prepend=num_seeds) + ]) return torch_geometric.sampler.SamplerOutput( node=renumber_map.cpu(), row=minors, col=majors, edge=edge_id, - batch=None, - num_sampled_nodes=None, + batch=renumber_map[:num_seeds], + num_sampled_nodes=num_sampled_nodes, num_sampled_edges=num_sampled_edges, ) diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py index e6dc342868b..d81db658b99 100644 --- a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py @@ -15,13 +15,14 @@ import re import warnings from math import ceil +from functools import reduce import pylibcugraph import numpy as np import cupy import cudf -from typing import Union, List, Dict, Tuple, Iterator +from typing import Union, List, Dict, Tuple, Iterator, Optional from cugraph.utilities import import_optional from cugraph.gnn.comms import cugraph_comms_get_raft_handle @@ -330,7 +331,7 @@ def __init__( self, graph: Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph], writer: DistSampleWriter, - local_seeds_per_call: int = 32768, + local_seeds_per_call: int, retain_original_seeds: bool = False, ): """ @@ -341,14 +342,16 @@ def __init__( writer: DistSampleWriter (required) The writer responsible for writing samples to disk or, in the future, device or host memory. 
- local_seeds_per_call: int (optional, default=32768) + local_seeds_per_call: int The number of seeds on this rank this sampler will process in a single sampling call. Batches will get split into multiple sampling calls based on this parameter. This parameter must be the same across all ranks. The total number of seeds processed per sampling call is this - parameter times the world size. + parameter times the world size. Subclasses should + generally calculate the appropriate number of + seeds. retain_original_seeds: bool (optional, default=False) Whether to retain the original seeds even if they do not appear in the output minibatch. This will @@ -361,6 +364,7 @@ def __init__( self.__handle = None self.__retain_original_seeds = retain_original_seeds + def get_reader(self) -> Iterator[Tuple[Dict[str, 'torch.Tensor'], int, int]]: """ Returns an iterator over sampled data. @@ -624,12 +628,20 @@ def _retain_original_seeds(self): class UniformNeighborSampler(DistSampler): + # Number of vertices in the output minibatch, based + # on benchmarking. + BASE_VERTICES_PER_BYTE = 0.1107662486009992 + + # Default number of seeds if the output minibatch + # size can't be estimated. + UNKNOWN_VERTICES_DEFAULT = 32768 + def __init__( self, graph: Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph], writer: DistSampleWriter, *, - local_seeds_per_call: int = 32768, + local_seeds_per_call: Optional[int] = None, retain_original_seeds: bool = False, fanout: List[int] = [-1], prior_sources_behavior: str = "exclude", @@ -638,12 +650,6 @@ def __init__( compress_per_hop: bool = False, with_replacement: bool = False, ): - super().__init__( - graph, - writer, - local_seeds_per_call=local_seeds_per_call, - retain_original_seeds=retain_original_seeds, - ) self.__fanout = fanout self.__prior_sources_behavior = prior_sources_behavior self.__deduplicate_sources = deduplicate_sources @@ -651,6 +657,27 @@ def __init__( self.__compression = compression self.__with_replacement = with_replacement + super().__init__( + graph, + writer, + local_seeds_per_call=self.__calc_local_seeds_per_call(local_seeds_per_call), + retain_original_seeds=retain_original_seeds, + ) + + def __calc_local_seeds_per_call(self, local_seeds_per_call: Optional[int]=None): + if local_seeds_per_call is None: + if len([x for x in self.__fanout if x <= 0]) > 0: + return UniformNeighborSampler.UNKNOWN_VERTICES_DEFAULT + + total_memory = torch.cuda.get_device_properties(0).total_memory + fanout_prod = reduce(lambda x, y : x * y, self.__fanout) + return int( + UniformNeighborSampler.BASE_VERTICES_PER_BYTE * total_memory / fanout_prod + ) + + return local_seeds_per_call + + def sample_batches( self, seeds: TensorType, From b93d82c880e3eadd5662bfc9f9ff2038899820a9 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 8 May 2024 13:01:39 -0700 Subject: [PATCH 41/80] deal with input mismatch, clean up mg example --- .../cugraph_pyg/data/graph_store.py | 42 ++- .../cugraph_pyg/examples/gcn_dist_mg.py | 259 ++++++++++++++++++ .../cugraph/gnn/data_loading/dist_sampler.py | 36 ++- 3 files changed, 314 insertions(+), 23 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py index 37f0a3f4424..7a761729cf9 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py @@ -128,14 +128,22 @@ def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: if self.__graph is None: edgelist_dict = self.__get_edgelist() + if 
self.is_multi_gpu: + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + vertices_array=cupy.arange(sum(self._num_vertices().values()), dtype='int64') + vertices_array = cupy.array_split(vertices_array, world_size)[rank] + self.__graph = pylibcugraph.MGGraph( self._resource_handle, graph_properties, [cupy.asarray(edgelist_dict['src'])], [cupy.asarray(edgelist_dict['dst'])], - edge_id_array=cupy.asarray(edgelist_dict['eid']), - edge_type_array=cupy.asarray(edgelist_dict['etp']), + vertices_array=[vertices_array], + edge_id_array=[cupy.asarray(edgelist_dict['eid'])], + edge_type_array=[cupy.asarray(edgelist_dict['etp'])], ) else: self.__graph = pylibcugraph.SGGraph( @@ -143,26 +151,31 @@ def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: graph_properties, cupy.asarray(edgelist_dict['src']), cupy.asarray(edgelist_dict['dst']), + vertices_array=cupy.arange(sum(self._num_vertices.values()), dtype='int64'), edge_id_array=cupy.asarray(edgelist_dict['eid']), edge_type_array=cupy.asarray(edgelist_dict['etp']), ) return self.__graph + def _num_vertices(self) -> Dict[str, int]: + num_vertices = {} + for edge_attr in self.get_all_edge_attrs(): + if edge_attr.size is not None: + num_vertices[edge_attr.edge_type[0]] = max(num_vertices[edge_attr.edge_type[0]], edge_attr.size[0]) if edge_attr.edge_type[0] in num_vertices else edge_attr.size[0] + num_vertices[edge_attr.edge_type[2]] = max(num_vertices[edge_attr.edge_type[2]], edge_attr.size[1]) if edge_attr.edge_type[2] in num_vertices else edge_attr.size[1] + else: + if edge_attr.edge_type[0] not in num_vertices: + num_vertices[edge_attr.edge_type[0]] = self.__edge_indices[edge_attr.edge_type][0].max() + 1 + if edge_attr.edge_type[2] not in num_vertices: + num_vertices[edge_attr.edge_type[1]] = self.__edge_indices[edge_attr.edge_type][1].max() + 1 + + return num_vertices + @property def _vertex_offsets(self) -> Dict[str, int]: - if self.__vertex_offsets is None: - num_vertices = {} - for edge_attr in self.get_all_edge_attrs(): - if edge_attr.size is not None: - num_vertices[edge_attr.edge_type[0]] = max(num_vertices[edge_attr.edge_type[0]], edge_attr.size[0]) if edge_attr.edge_type[0] in num_vertices else edge_attr.size[0] - num_vertices[edge_attr.edge_type[2]] = max(num_vertices[edge_attr.edge_type[2]], edge_attr.size[1]) if edge_attr.edge_type[2] in num_vertices else edge_attr.size[1] - else: - if edge_attr.edge_type[0] not in num_vertices: - num_vertices[edge_attr.edge_type[0]] = self.__edge_indices[edge_attr.edge_type][0].max() + 1 - if edge_attr.edge_type[2] not in num_vertices: - num_vertices[edge_attr.edge_type[1]] = self.__edge_indices[edge_attr.edge_type][1].max() + 1 - + if self.__vertex_offsets is None: + num_vertices = self._num_vertices() ordered_keys = sorted(list(num_vertices.keys())) self.__vertex_offsets = {} offset = 0 @@ -203,6 +216,7 @@ def __get_edgelist(self): ]) for (dst_type,rel_type,src_type) in sorted_keys ], axis=1).cuda() + # FIXME this is not calculated correctly edge_type_array = torch.arange(len(sorted_keys), dtype=torch.int32, device='cuda').repeat_interleave(torch.tensor([ self.__edge_indices[et].shape[1] for et in sorted_keys ], device='cuda', dtype=torch.int32)) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py index e69de29bb2d..e56dc5e190b 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py @@ -0,0 +1,259 @@ 
+import argparse +import os +import tempfile +import time + +import numpy as np +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn.functional as F +from ogb.nodeproppred import PygNodePropPredDataset +from torch.nn.parallel import DistributedDataParallel + +import torch_geometric + +from cugraph.gnn import ( + cugraph_comms_init, + cugraph_comms_shutdown, + cugraph_comms_create_unique_id, +) + +# Allow computation on objects that are larger than GPU memory +# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory +os.environ['CUDF_SPILL'] = '1' + +# Ensures that a CUDA context is not created on import of rapids. +# Allows pytorch to create the context instead +os.environ['RAPIDS_NO_INITIALIZE'] = '1' + +def init_pytorch_worker(rank, world_size, cugraph_id): + import rmm + rmm.reinitialize(devices=rank, managed_memory=True, pool_allocator=True,) + + import cupy + cupy.cuda.Device(rank).use() + from rmm.allocators.cupy import rmm_cupy_allocator + cupy.cuda.set_allocator(rmm_cupy_allocator) + + from cugraph.testing.mg_utils import enable_spilling + enable_spilling() + + torch.cuda.set_device(rank) + + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12355' + dist.init_process_group('nccl', rank=rank, world_size=world_size) + + cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) + + +def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan_out, + split_idx, num_classes, wall_clock_start, tempdir=None, + num_layers=3): + + init_pytorch_worker( + rank, + world_size, + cugraph_id, + ) + + model = model.to(rank) + model = DistributedDataParallel(model, device_ids=[rank]) + optimizer = torch.optim.Adam(model.parameters(), lr=0.01, + weight_decay=0.0005) + + kwargs = dict( + num_neighbors=[fan_out] * num_layers, + batch_size=batch_size, + ) + # Set Up Neighbor Loading + from cugraph_pyg.data import GraphStore, TensorDictFeatureStore + from cugraph_pyg.loader import NeighborLoader + + graph_store = GraphStore(is_multi_gpu=True) + ixr = torch.tensor_split(data.edge_index, world_size, axis=1)[rank] + graph_store[('node','rel','node'), 'coo', False, (data.num_nodes, data.num_nodes)] = ixr + + feature_store = TensorDictFeatureStore() + feature_store['node', 'x'] = data.x + feature_store['node', 'y'] = data.y + + dist.barrier() + + ix_train = torch.tensor_split(split_idx['train'], world_size)[rank].cuda() + train_path = os.path.join(tempdir, f'train_{rank}') + os.mkdir(train_path) + train_loader = NeighborLoader( + (feature_store, graph_store), + input_nodes=ix_train, + directory=train_path, + shuffle=True, + drop_last=True, + **kwargs + ) + + ix_test = torch.tensor_split(split_idx['test'], world_size)[rank].cuda() + test_path = os.path.join(tempdir, f'test_{rank}') + os.mkdir(test_path) + test_loader = NeighborLoader( + (feature_store, graph_store), + input_nodes=ix_test, + directory=test_path, + shuffle=True, + drop_last=True, + local_seeds_per_call=80000, + **kwargs + ) + + ix_valid = torch.tensor_split(split_idx['valid'], world_size)[rank].cuda() + valid_path = os.path.join(tempdir, f'valid_{rank}') + os.mkdir(valid_path) + valid_loader = NeighborLoader( + (feature_store, graph_store), + input_nodes=ix_valid, + directory=valid_path, + shuffle=True, + drop_last=True, + **kwargs + ) + + dist.barrier() + + eval_steps = 1000 + warmup_steps = 20 + dist.barrier() + torch.cuda.synchronize() + + if rank == 0: + prep_time = 
round(time.perf_counter() - wall_clock_start, 2) + print("Total time before training begins (prep_time) =", prep_time, + "seconds") + print("Beginning training...") + for epoch in range(epochs): + for i, batch in enumerate(train_loader): + if i == warmup_steps: + torch.cuda.synchronize() + start = time.time() + + batch = batch.to(rank) + batch_size = batch.batch_size + + batch.y = batch.y.to(torch.long) + optimizer.zero_grad() + out = model(batch.x, batch.edge_index) + loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size]) + loss.backward() + optimizer.step() + if rank == 0 and i % 10 == 0: + print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + + ", Loss: " + str(loss)) + nb = i + 1.0 + + if rank == 0: + print("Average Training Iteration Time:", + (time.time() - start) / (nb - warmup_steps), "s/iter") + + with torch.no_grad(): + total_correct = total_examples = 0 + for i, batch in enumerate(valid_loader): + if i >= eval_steps: + break + + batch = batch.to(rank) + batch_size = batch.batch_size + + batch.y = batch.y.to(torch.long) + out = model(batch.x, batch.edge_index)[:batch_size] + + pred = out.argmax(dim=-1) + y = batch.y[:batch_size].view(-1).to(torch.long) + + total_correct += int((pred == y).sum()) + total_examples += y.size(0) + + acc_val = total_correct / total_examples + if rank == 0: + print(f"Validation Accuracy: {acc_val * 100.0:.4f}%", ) + + torch.cuda.synchronize() + + with torch.no_grad(): + total_correct = total_examples = 0 + for i, batch in enumerate(test_loader): + batch = batch.to(rank) + batch_size = batch.batch_size + + batch.y = batch.y.to(torch.long) + out = model(batch.x, batch.edge_index)[:batch_size] + + pred = out.argmax(dim=-1) + y = batch.y[:batch_size].view(-1).to(torch.long) + + total_correct += int((pred == y).sum()) + total_examples += y.size(0) + + acc_test = total_correct / total_examples + if rank == 0: + print(f"Test Accuracy: {acc_test * 100.0:.4f}%", ) + #dist.barrier() + + if rank == 0: + total_time = round(time.perf_counter() - wall_clock_start, 2) + print("Total Program Runtime (total_time) =", total_time, "seconds") + print("total_time - prep_time =", total_time - prep_time, "seconds") + + cugraph_comms_shutdown() + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--hidden_channels', type=int, default=256) + parser.add_argument('--num_layers', type=int, default=2) + parser.add_argument('--lr', type=float, default=0.001) + parser.add_argument('--epochs', type=int, default=4) + parser.add_argument('--batch_size', type=int, default=1024) + parser.add_argument('--fan_out', type=int, default=30) + parser.add_argument('--tempdir_root', type=str, default=None) + parser.add_argument('--dataset_root', type=str, default='dataset') + parser.add_argument('--dataset', type=str, default='ogbn-products') + + parser.add_argument( + "--n_devices", type=int, default=-1, + help="1-8 to use that many GPUs. 
Defaults to all available GPUs") + + args = parser.parse_args() + wall_clock_start = time.perf_counter() + + from rmm.allocators.torch import rmm_torch_allocator + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + + + dataset = PygNodePropPredDataset(name=args.dataset, + root=args.dataset_root) + split_idx = dataset.get_idx_split() + data = dataset[0] + data.y = data.y.reshape(-1) + + model = torch_geometric.nn.models.GCN(dataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes) + + print("Data =", data) + if args.n_devices == -1: + world_size = torch.cuda.device_count() + else: + world_size = args.n_devices + print('Let\'s use', world_size, 'GPUs!') + + # Create the uid needed for cuGraph comms + cugraph_id = cugraph_comms_create_unique_id() + + with tempfile.TemporaryDirectory(dir=args.tempdir_root) as tempdir: + mp.spawn( + run_train, + args=(data, world_size, cugraph_id, model, args.epochs, args.batch_size, + args.fan_out, split_idx, dataset.num_classes, + wall_clock_start, tempdir, args.num_layers), + nprocs=world_size, join=True) \ No newline at end of file diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py index d81db658b99..89c0118e946 100644 --- a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py @@ -36,21 +36,33 @@ class DistSampleReader: - def __init__(self, directory:str, *, format: str = "parquet", rank:int = 0): + def __init__(self, directory:str, *, format: str = "parquet", rank:Optional[int] = None, filelist=None): self.__format = format self.__directory = directory if format != "parquet": raise ValueError("Invalid format (currently supported: 'parquet')") - files = os.listdir(directory) - ex = re.compile(r'batch\=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet') - filematch = [ex.match(f) for f in files] - filematch = [f for f in filematch if f] - filematch = [f for f in filematch if int(f[1]) == rank] - filematch = sorted(filematch, key=lambda f: int(f[2]), reverse=True) + if filelist is None: + files = os.listdir(directory) + ex = re.compile(r'batch\=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet') + filematch = [ex.match(f) for f in files] + filematch = [f for f in filematch if f] + filematch = [f for f in filematch if int(f[1]) == rank] + + batch_count = sum([int(f[4]) - int(f[2]) + 1 for f in filematch]) + filematch = sorted(filematch, key=lambda f: int(f[2]), reverse=True) + + self.__files = filematch + else: + self.__files = list(filelist) - self.__files = filematch + if rank is None: + self.__batch_count = batch_count + else: + batch_count = torch.tensor([batch_count], device='cuda') + torch.distributed.all_reduce(batch_count, torch.distributed.ReduceOp.MIN) + self.__batch_count = int(batch_count) def __iter__(self): return self @@ -62,6 +74,12 @@ def __next__(self): start_inclusive = int(f[2]) end_inclusive = int(f[4]) + if(end_inclusive - start_inclusive + 1) > self.__batch_count: + end_inclusive = start_inclusive + self.__batch_count - 1 + self.__batch_count = 0 + else: + self.__batch_count -= (end_inclusive - start_inclusive + 1) + df = cudf.read_parquet(os.path.join(self.__directory, fname)) tensors = {} for col in list(df.columns): @@ -712,7 +730,7 @@ def sample_batches( label_to_output_comm_rank=cupy.asarray(label_to_output_comm_rank), h_fan_out=np.array(self.__fanout, dtype="int32"), with_replacement=self.__with_replacement, - do_expensive_check=False, + 
do_expensive_check=True, with_edge_properties=True, random_state=random_state + rank, prior_sources_behavior=self.__prior_sources_behavior, From 55ee4bd123ba05767c4d374aa8c37e1329ab7754 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 8 May 2024 13:04:40 -0700 Subject: [PATCH 42/80] fix sg reader --- python/cugraph/cugraph/gnn/data_loading/dist_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py index 89c0118e946..11f7940df54 100644 --- a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py @@ -387,7 +387,7 @@ def get_reader(self) -> Iterator[Tuple[Dict[str, 'torch.Tensor'], int, int]]: """ Returns an iterator over sampled data. """ - rank = torch.distributed.get_rank() if self.is_multi_gpu else 0 + rank = torch.distributed.get_rank() if self.is_multi_gpu else None return self.__writer.get_reader(rank) def sample_batches( From 38578c905289d5a7db5b89de6d77f30ebb9ebfa1 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 8 May 2024 13:25:17 -0700 Subject: [PATCH 43/80] support shuffle, drop last --- .../cugraph_pyg/loader/neighbor_loader.py | 1 + .../cugraph_pyg/loader/node_loader.py | 26 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py index 47d8065abce..505a310114b 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py @@ -210,5 +210,6 @@ def __init__(self, transform=transform, transform_sampler_output=transform_sampler_output, filter_per_worker=filter_per_worker, + batch_size=batch_size, **kwargs, ) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py index a8f102e984d..e39d9c08c3e 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py @@ -19,6 +19,7 @@ from cugraph.utilities.utils import import_optional torch_geometric = import_optional('torch_geometric') +torch = import_optional('torch') class NodeLoader: """ @@ -35,6 +36,9 @@ def __init__(self, filter_per_worker: Optional[bool] = None, custom_cls: Optional['torch_geometric.data.HeteroData'] = None, input_id: 'torch_geometric.typing.OptTensor' = None, + batch_size: int = 1, + shuffle: bool = False, + drop_last: bool = False, **kwargs,): """ Parameters @@ -98,10 +102,30 @@ def __init__(self, self.__data = data self.__node_sampler = node_sampler + + self.__batch_size = batch_size + self.__shuffle = shuffle + self.__drop_last = drop_last def __iter__(self): + if self.__shuffle: + perm = torch.randperm(self.__input_data.node.numel()) + else: + perm = torch.arange(self.__input_data.node.numel()) + + if self.__drop_last: + d = perm.numel() % self.__batch_size + perm = perm[:-d] + + input_data = torch_geometric.loader.node_loader.NodeSamplerInput( + input_id=self.__input_data.input_id[perm], + node=self.__input_data.node[perm], + time=None if self.__input_data.time is None else self.__input_data.time[perm], + input_type=self.__input_data.input_type, + ) + return cugraph_pyg.sampler.SampleIterator( self.__data, - self.__node_sampler.sample_from_nodes(self.__input_data) + self.__node_sampler.sample_from_nodes(input_data) ) \ No newline at end of file From 
92d811f022903c5007a9a1823e96b423995188e2 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 8 May 2024 14:53:16 -0700 Subject: [PATCH 44/80] correctly calculate edge ids --- .../cugraph_pyg/data/graph_store.py | 37 ++++++++++++++++--- .../cugraph_pyg/loader/node_loader.py | 2 +- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py index 7a761729cf9..2cdb1ee1fd3 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py @@ -216,15 +216,42 @@ def __get_edgelist(self): ]) for (dst_type,rel_type,src_type) in sorted_keys ], axis=1).cuda() - # FIXME this is not calculated correctly edge_type_array = torch.arange(len(sorted_keys), dtype=torch.int32, device='cuda').repeat_interleave(torch.tensor([ self.__edge_indices[et].shape[1] for et in sorted_keys ], device='cuda', dtype=torch.int32)) - edge_id_array = torch.concat([ - torch.arange(self.__edge_indices[et].shape[1], dtype=torch.int64, device='cuda') - for et in sorted_keys - ]) + if self.is_multi_gpu: + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + num_edges_t = torch.tensor( + [ + self.__edge_indices[et].shape[1] + for et in sorted_keys + ], + device='cuda' + ) + num_edges_all_t = torch.empty(world_size, num_edges_t.numel(), dtype=torch.int64, device='cuda') + torch.distributed.all_gather_into_tensor(num_edges_all_t, num_edges_t) + + if rank > 0: + start_offsets = num_edges_all_t[:rank].T.sum(axis=1) + edge_id_array = torch.concat([ + torch.arange(start_offsets[i], start_offsets[i] + num_edges_all_t[rank][i], dtype=torch.int64, device='cuda') + for i in range(len(sorted_keys)) + ]) + else: + edge_id_array = torch.concat([ + torch.arange(self.__edge_indices[et].shape[1], dtype=torch.int64, device='cuda') + for et in sorted_keys + ]) + + else: + # single GPU + edge_id_array = torch.concat([ + torch.arange(self.__edge_indices[et].shape[1], dtype=torch.int64, device='cuda') + for et in sorted_keys + ]) return { 'dst': edge_index[0], diff --git a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py index e39d9c08c3e..68cbae43467 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py @@ -119,7 +119,7 @@ def __iter__(self): perm = perm[:-d] input_data = torch_geometric.loader.node_loader.NodeSamplerInput( - input_id=self.__input_data.input_id[perm], + input_id=None if self.__input_data.input_id is None else self.__input_data.input_id[perm], node=self.__input_data.node[perm], time=None if self.__input_data.time is None else self.__input_data.time[perm], input_type=self.__input_data.input_type, From 617524d1f241a07336f677cc754a3bde70faf95e Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 8 May 2024 15:03:38 -0700 Subject: [PATCH 45/80] style --- .../all_cuda-118_arch-x86_64.yaml | 1 + .../all_cuda-122_arch-x86_64.yaml | 1 + dependencies.yaml | 2 +- .../conda/cugraph_pyg_dev_cuda-118.yaml | 1 + .../cugraph-pyg/cugraph_pyg/data/__init__.py | 7 +- .../cugraph_pyg/data/feature_store.py | 75 ++-- .../cugraph_pyg/data/graph_store.py | 227 ++++++----- .../cugraph_pyg/examples/gcn_dist_mg.py | 185 ++++++--- .../cugraph_pyg/examples/gcn_dist_sg.py | 106 +++--- .../cugraph_pyg/loader/__init__.py | 9 +- .../cugraph_pyg/loader/neighbor_loader.py | 357 +++++++++--------- 
.../cugraph_pyg/loader/node_loader.py         | 191 +++++-----
 .../cugraph_pyg/sampler/__init__.py           |   2 +-
 .../cugraph_pyg/sampler/sampler.py            | 237 ++++++++----
 .../cugraph_pyg/sampler/sampler_utils.py      |   1 -
 python/cugraph-pyg/pyproject.toml             |   1 +
 .../cugraph/gnn/data_loading/dist_sampler.py  |  98 +++--
 python/nx-cugraph/README.md                   |  12 +-
 18 files changed, 902 insertions(+), 611 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 659a2b911fb..3679ff30ad5 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -69,6 +69,7 @@ dependencies:
 - sphinx-markdown-tables
 - sphinx<6
 - sphinxcontrib-websupport
+- tensordict
 - ucx-proc=*=gpu
 - ucx-py==0.38.*
 - wget
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 377e4151f9b..0fa09fd4742 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -74,6 +74,7 @@ dependencies:
 - sphinx-markdown-tables
 - sphinx<6
 - sphinxcontrib-websupport
+- tensordict
 - ucx-proc=*=gpu
 - ucx-py==0.38.*
 - wget
diff --git a/dependencies.yaml b/dependencies.yaml
index 7cf27bccaaa..cd0e95d2a08 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -481,7 +481,7 @@ dependencies:
       packages:
         - *numba
        - *numpy
-        - *tensordict
+        - tensordict
     - output_types: [pyproject]
       packages:
         - *cugraph
diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml
index 94e9f1decbd..c13c854dbcc 100644
--- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml
+++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml
@@ -21,4 +21,5 @@ dependencies:
 - pytorch-cuda==11.8
 - pytorch>=2.0
 - scipy
+- tensordict>=0.1.2
 name: cugraph_pyg_dev_cuda-118
diff --git a/python/cugraph-pyg/cugraph_pyg/data/__init__.py b/python/cugraph-pyg/cugraph_pyg/data/__init__.py
index fd87e9da2c0..c2ff510821c 100644
--- a/python/cugraph-pyg/cugraph_pyg/data/__init__.py
+++ b/python/cugraph-pyg/cugraph_pyg/data/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at @@ -17,8 +17,7 @@ from cugraph_pyg.data.graph_store import GraphStore from cugraph_pyg.data.feature_store import TensorDictFeatureStore + def CuGraphStore(*args, **kwargs): - warnings.warn( - "CuGraphStore has been renamed to DaskGraphStore" - ) + warnings.warn("CuGraphStore has been renamed to DaskGraphStore") return DaskGraphStore(*args, **kwargs) diff --git a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py index 7f6543580ff..20a9ecdc359 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py @@ -17,11 +17,16 @@ from cugraph.utilities.utils import import_optional, MissingModule -torch = import_optional('torch') -torch_geometric = import_optional('torch_geometric') -tensordict = import_optional('tensordict') +torch = import_optional("torch") +torch_geometric = import_optional("torch_geometric") +tensordict = import_optional("tensordict") -class TensorDictFeatureStore(object if isinstance(torch_geometric, MissingModule) else torch_geometric.data.FeatureStore): + +class TensorDictFeatureStore( + object + if isinstance(torch_geometric, MissingModule) + else torch_geometric.data.FeatureStore +): """ A basic implementation of the PyG FeatureStore interface that stores feature data in a single TensorDict. This type of feature store is @@ -33,60 +38,84 @@ def __init__(self): super().__init__() self.__features = {} - - def _put_tensor(self, tensor:'torch_geometric.typing.FeatureTensorType', attr: 'torch_geometric.data.feature_store.TensorAttr') ->bool: + + def _put_tensor( + self, + tensor: "torch_geometric.typing.FeatureTensorType", + attr: "torch_geometric.data.feature_store.TensorAttr", + ) -> bool: if attr.group_name in self.__features: td = self.__features[attr.group_name] batch_size = td.batch_size[0] - if attr.is_set('index'): + if attr.is_set("index"): if attr.attr_name in td.keys(): if attr.index.shape[0] != batch_size: raise ValueError( - f"Leading size of index tensor does not match existing tensors for group name {attr.group_name};" - f" Expected {batch_size}, got {attr.index.shape[0]}" + "Leading size of index tensor " + "does not match existing tensors for group name " + f"{attr.group_name}; Expected {batch_size}, " + f"got {attr.index.shape[0]}" ) td[attr.attr_name][attr.index] = tensor return True else: - warnings.warn(f"Ignoring index parameter (attribute does not exist for group {attr.group_name})") + warnings.warn( + "Ignoring index parameter " + f"(attribute does not exist for group {attr.group_name})" + ) if tensor.shape[0] != batch_size: raise ValueError( - f"Leading size of input tensor does not match existing tensors for group name {attr.group_name};" + "Leading size of input tensor does not match " + f"existing tensors for group name {attr.group_name};" f" Expected {batch_size}, got {tensor.shape[0]}" ) else: batch_size = tensor.shape[0] - self.__features[attr.group_name] = tensordict.TensorDict({}, batch_size=batch_size) + self.__features[attr.group_name] = tensordict.TensorDict( + {}, batch_size=batch_size + ) self.__features[attr.group_name][attr.attr_name] = tensor return True - - def _get_tensor(self, attr: 'torch_geometric.data.feature_store.TensorAttr') -> Optional['torch_geometric.typing.FeatureTensorType']: + + def _get_tensor( + self, attr: "torch_geometric.data.feature_store.TensorAttr" + ) -> Optional["torch_geometric.typing.FeatureTensorType"]: if attr.group_name not in self.__features: 
return None - + if attr.attr_name not in self.__features[attr.group_name].keys(): return None - + tensor = self.__features[attr.group_name][attr.attr_name] - return tensor if (attr.index is None or (not attr.is_set('index'))) else tensor[attr.index] - - def _remove_tensor(self, attr: 'torch_geometric.data.feature_store.TensorAttr') -> bool: + return ( + tensor + if (attr.index is None or (not attr.is_set("index"))) + else tensor[attr.index] + ) + + def _remove_tensor( + self, attr: "torch_geometric.data.feature_store.TensorAttr" + ) -> bool: if attr.group_name not in self.__features: return False - + if attr.attr_name not in self.__features[attr.group_name]: return False - + del self.__features[attr.group_name][attr.attr_name] return True - def _get_tensor_size(self, attr: 'torch_geometric.data.feature_store.TensorAttr') -> Tuple: + def _get_tensor_size( + self, attr: "torch_geometric.data.feature_store.TensorAttr" + ) -> Tuple: return self._get_tensor(attr).size() - def get_all_tensor_attrs(self) -> List['torch_geometric.data.feature_store.TensorAttr']: + def get_all_tensor_attrs( + self, + ) -> List["torch_geometric.data.feature_store.TensorAttr"]: attrs = [] for group_name, td in self.__features.items(): for attr_name in td.keys(): diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py index 2cdb1ee1fd3..1ba712a4dbd 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py @@ -26,20 +26,25 @@ # Have to use import_optional even though these are required # dependencies in order to build properly. -torch_geometric = import_optional('torch_geometric') -torch = import_optional('torch') -tensordict = import_optional('tensordict') +torch_geometric = import_optional("torch_geometric") +torch = import_optional("torch") +tensordict = import_optional("tensordict") -TensorType = Union['torch.Tensor', cupy.ndarray, np.ndarray, cudf.Series, pandas.Series] +TensorType = Union["torch.Tensor", cupy.ndarray, np.ndarray, cudf.Series, pandas.Series] -class GraphStore(object if isinstance(torch_geometric, MissingModule) else torch_geometric.data.GraphStore): + +class GraphStore( + object + if isinstance(torch_geometric, MissingModule) + else torch_geometric.data.GraphStore +): """ This object uses lazy graph creation. Users can repeatedly call put_edge_index, and the tensors won't be converted into a cuGraph graph until one is needed (i.e. when creating a loader). 
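As the docstring above describes, put_edge_index only stages edge tensors in a TensorDict; the cuGraph graph is built lazily the first time a loader needs it. A minimal sketch of populating the two new stores, modeled on the single-GPU example added later in this patch (the node/edge counts, feature width, and the ("node", "rel", "node") edge type are illustrative placeholders, not part of the change):

import torch
import cugraph_pyg

# Hypothetical toy inputs: 100 nodes, 500 random edges, 16-dim features.
num_nodes = 100
edge_index = torch.randint(0, num_nodes, (2, 500))
x = torch.randn(num_nodes, 16)
y = torch.randint(0, 10, (num_nodes,))

# Stage the edges; no pylibcugraph graph is constructed yet (lazy creation).
graph_store = cugraph_pyg.data.GraphStore()
graph_store[
    ("node", "rel", "node"), "coo", False, (num_nodes, num_nodes)
] = edge_index

# Features live in a single TensorDict keyed by group name ("node" here).
feature_store = cugraph_pyg.data.TensorDictFeatureStore()
feature_store["node", "x"] = x
feature_store["node", "y"] = y

The (feature_store, graph_store) pair is then handed to the loaders further down in this patch; the graph itself is only materialized when a loader requests it.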
""" - def __init__(self, is_multi_gpu:bool=False): + def __init__(self, is_multi_gpu: bool = False): self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,)) self.__sizes = {} self.__graph = None @@ -49,20 +54,26 @@ def __init__(self, is_multi_gpu:bool=False): super().__init__() - def _put_edge_index(self, edge_index:'torch_geometric.typing.EdgeTensorType', edge_attr:'torch_geometric.data.EdgeAttr') ->bool: + def _put_edge_index( + self, + edge_index: "torch_geometric.typing.EdgeTensorType", + edge_attr: "torch_geometric.data.EdgeAttr", + ) -> bool: if edge_attr.layout != torch_geometric.data.graph_store.EdgeLayout.COO: raise ValueError("Only COO format supported") if isinstance(edge_index, (cupy.ndarray, cudf.Series)): - edge_index = torch.as_tensor(edge_index, device='cuda') + edge_index = torch.as_tensor(edge_index, device="cuda") elif isinstance(edge_index, (np.ndarray)): - edge_index = torch.as_tensor(edge_index, device='cpu') + edge_index = torch.as_tensor(edge_index, device="cpu") elif isinstance(edge_index, pandas.Series): - edge_index = torch.as_tensor(edge_index.values, device='cpu') + edge_index = torch.as_tensor(edge_index.values, device="cpu") elif isinstance(edge_index, cudf.Series): - edge_index = torch.as_tensor(edge_index.values, device='cuda') - - self.__edge_indices[edge_attr.edge_type] = torch.stack([edge_index[0], edge_index[1]]) + edge_index = torch.as_tensor(edge_index.values, device="cuda") + + self.__edge_indices[edge_attr.edge_type] = torch.stack( + [edge_index[0], edge_index[1]] + ) self.__sizes[edge_attr.edge_type] = edge_attr.size # invalidate the graph @@ -70,38 +81,34 @@ def _put_edge_index(self, edge_index:'torch_geometric.typing.EdgeTensorType', ed self.__vertex_offsets = None return True - def _get_edge_index(self, edge_attr:'torch_geometric.data.EdgeAttr')->Optional['torch_geometric.typing.EdgeTensorType']: - ei = torch_geometric.EdgeIndex( - self.__edge_indices[edge_attr.edge_type] - ) - - - if edge_attr.layout == 'csr': - return ei.sort_by('row').values.get_csr() - elif edge_attr.layout == 'csc': - return ei.sort_by('col').values.get_csc() + def _get_edge_index( + self, edge_attr: "torch_geometric.data.EdgeAttr" + ) -> Optional["torch_geometric.typing.EdgeTensorType"]: + ei = torch_geometric.EdgeIndex(self.__edge_indices[edge_attr.edge_type]) + + if edge_attr.layout == "csr": + return ei.sort_by("row").values.get_csr() + elif edge_attr.layout == "csc": + return ei.sort_by("col").values.get_csc() return ei - def _remove_edge_index(self, edge_attr:'torch_geometric.data.EdgeAttr')->bool: + def _remove_edge_index(self, edge_attr: "torch_geometric.data.EdgeAttr") -> bool: del self.__edge_indices[edge_attr.edge_type] - + # invalidate the graph self.__graph = None return True - def get_all_edge_attrs(self) -> List['torch_geometric.data.EdgeAttr']: + def get_all_edge_attrs(self) -> List["torch_geometric.data.EdgeAttr"]: attrs = [] for et in self.__edge_indices.keys(leaves_only=True, include_nested=True): attrs.append( torch_geometric.data.EdgeAttr( - edge_type=et, - layout='coo', - is_sorted=False, - size=self.__sizes[et] + edge_type=et, layout="coo", is_sorted=False, size=self.__sizes[et] ) ) - + return attrs @property @@ -122,8 +129,7 @@ def _resource_handle(self): @property def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: graph_properties = pylibcugraph.GraphProperties( - is_multigraph=True, - is_symmetric=False + is_multigraph=True, is_symmetric=False ) if self.__graph is None: @@ -133,48 +139,64 @@ def _graph(self) -> 
Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() - vertices_array=cupy.arange(sum(self._num_vertices().values()), dtype='int64') + vertices_array = cupy.arange( + sum(self._num_vertices().values()), dtype="int64" + ) vertices_array = cupy.array_split(vertices_array, world_size)[rank] - + self.__graph = pylibcugraph.MGGraph( self._resource_handle, graph_properties, - [cupy.asarray(edgelist_dict['src'])], - [cupy.asarray(edgelist_dict['dst'])], + [cupy.asarray(edgelist_dict["src"])], + [cupy.asarray(edgelist_dict["dst"])], vertices_array=[vertices_array], - edge_id_array=[cupy.asarray(edgelist_dict['eid'])], - edge_type_array=[cupy.asarray(edgelist_dict['etp'])], + edge_id_array=[cupy.asarray(edgelist_dict["eid"])], + edge_type_array=[cupy.asarray(edgelist_dict["etp"])], ) else: self.__graph = pylibcugraph.SGGraph( self._resource_handle, graph_properties, - cupy.asarray(edgelist_dict['src']), - cupy.asarray(edgelist_dict['dst']), - vertices_array=cupy.arange(sum(self._num_vertices.values()), dtype='int64'), - edge_id_array=cupy.asarray(edgelist_dict['eid']), - edge_type_array=cupy.asarray(edgelist_dict['etp']), + cupy.asarray(edgelist_dict["src"]), + cupy.asarray(edgelist_dict["dst"]), + vertices_array=cupy.arange( + sum(self._num_vertices.values()), dtype="int64" + ), + edge_id_array=cupy.asarray(edgelist_dict["eid"]), + edge_type_array=cupy.asarray(edgelist_dict["etp"]), ) - + return self.__graph def _num_vertices(self) -> Dict[str, int]: num_vertices = {} for edge_attr in self.get_all_edge_attrs(): if edge_attr.size is not None: - num_vertices[edge_attr.edge_type[0]] = max(num_vertices[edge_attr.edge_type[0]], edge_attr.size[0]) if edge_attr.edge_type[0] in num_vertices else edge_attr.size[0] - num_vertices[edge_attr.edge_type[2]] = max(num_vertices[edge_attr.edge_type[2]], edge_attr.size[1]) if edge_attr.edge_type[2] in num_vertices else edge_attr.size[1] + num_vertices[edge_attr.edge_type[0]] = ( + max(num_vertices[edge_attr.edge_type[0]], edge_attr.size[0]) + if edge_attr.edge_type[0] in num_vertices + else edge_attr.size[0] + ) + num_vertices[edge_attr.edge_type[2]] = ( + max(num_vertices[edge_attr.edge_type[2]], edge_attr.size[1]) + if edge_attr.edge_type[2] in num_vertices + else edge_attr.size[1] + ) else: if edge_attr.edge_type[0] not in num_vertices: - num_vertices[edge_attr.edge_type[0]] = self.__edge_indices[edge_attr.edge_type][0].max() + 1 + num_vertices[edge_attr.edge_type[0]] = ( + self.__edge_indices[edge_attr.edge_type][0].max() + 1 + ) if edge_attr.edge_type[2] not in num_vertices: - num_vertices[edge_attr.edge_type[1]] = self.__edge_indices[edge_attr.edge_type][1].max() + 1 - + num_vertices[edge_attr.edge_type[1]] = ( + self.__edge_indices[edge_attr.edge_type][1].max() + 1 + ) + return num_vertices @property def _vertex_offsets(self) -> Dict[str, int]: - if self.__vertex_offsets is None: + if self.__vertex_offsets is None: num_vertices = self._num_vertices() ordered_keys = sorted(list(num_vertices.keys())) self.__vertex_offsets = {} @@ -203,61 +225,92 @@ def __get_edgelist(self): etp: edge types for each edge (int32) Note that these are in lexicographic order. """ - sorted_keys = sorted(list(self.__edge_indices.keys(leaves_only=True,include_nested=True))) + sorted_keys = sorted( + list(self.__edge_indices.keys(leaves_only=True, include_nested=True)) + ) # note that this still follows the PyG convention of (dst, rel, src) # i.e. 
(author, writes, paper): [[0,1,2],[2,0,1]] is referring to a # cuGraph graph where (paper 2) -> (author 0), (paper 0) -> (author 1), # and (paper 1) -> (author 0) - edge_index = torch.concat([ - torch.stack([ - self.__edge_indices[dst_type,rel_type,src_type][0] + self._vertex_offsets[dst_type], - self.__edge_indices[dst_type,rel_type,src_type][1] + self._vertex_offsets[src_type], - ]) for (dst_type,rel_type,src_type) in sorted_keys - ], axis=1).cuda() - - edge_type_array = torch.arange(len(sorted_keys), dtype=torch.int32, device='cuda').repeat_interleave(torch.tensor([ - self.__edge_indices[et].shape[1] for et in sorted_keys - ], device='cuda', dtype=torch.int32)) + edge_index = torch.concat( + [ + torch.stack( + [ + self.__edge_indices[dst_type, rel_type, src_type][0] + + self._vertex_offsets[dst_type], + self.__edge_indices[dst_type, rel_type, src_type][1] + + self._vertex_offsets[src_type], + ] + ) + for (dst_type, rel_type, src_type) in sorted_keys + ], + axis=1, + ).cuda() + + edge_type_array = torch.arange( + len(sorted_keys), dtype=torch.int32, device="cuda" + ).repeat_interleave( + torch.tensor( + [self.__edge_indices[et].shape[1] for et in sorted_keys], + device="cuda", + dtype=torch.int32, + ) + ) if self.is_multi_gpu: rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() num_edges_t = torch.tensor( - [ - self.__edge_indices[et].shape[1] - for et in sorted_keys - ], - device='cuda' + [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda" + ) + num_edges_all_t = torch.empty( + world_size, num_edges_t.numel(), dtype=torch.int64, device="cuda" ) - num_edges_all_t = torch.empty(world_size, num_edges_t.numel(), dtype=torch.int64, device='cuda') torch.distributed.all_gather_into_tensor(num_edges_all_t, num_edges_t) if rank > 0: start_offsets = num_edges_all_t[:rank].T.sum(axis=1) - edge_id_array = torch.concat([ - torch.arange(start_offsets[i], start_offsets[i] + num_edges_all_t[rank][i], dtype=torch.int64, device='cuda') - for i in range(len(sorted_keys)) - ]) + edge_id_array = torch.concat( + [ + torch.arange( + start_offsets[i], + start_offsets[i] + num_edges_all_t[rank][i], + dtype=torch.int64, + device="cuda", + ) + for i in range(len(sorted_keys)) + ] + ) else: - edge_id_array = torch.concat([ - torch.arange(self.__edge_indices[et].shape[1], dtype=torch.int64, device='cuda') - for et in sorted_keys - ]) - + edge_id_array = torch.concat( + [ + torch.arange( + self.__edge_indices[et].shape[1], + dtype=torch.int64, + device="cuda", + ) + for et in sorted_keys + ] + ) + else: # single GPU - edge_id_array = torch.concat([ - torch.arange(self.__edge_indices[et].shape[1], dtype=torch.int64, device='cuda') - for et in sorted_keys - ]) + edge_id_array = torch.concat( + [ + torch.arange( + self.__edge_indices[et].shape[1], + dtype=torch.int64, + device="cuda", + ) + for et in sorted_keys + ] + ) return { - 'dst': edge_index[0], - 'src': edge_index[1], - 'etp': edge_type_array, - 'eid': edge_id_array, + "dst": edge_index[0], + "src": edge_index[1], + "etp": edge_type_array, + "eid": edge_id_array, } - - \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py index e56dc5e190b..dfb49210143 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py @@ -1,9 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import os import tempfile import time -import numpy as np import torch import torch.distributed as dist import torch.multiprocessing as mp @@ -21,36 +33,57 @@ # Allow computation on objects that are larger than GPU memory # https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory -os.environ['CUDF_SPILL'] = '1' +os.environ["CUDF_SPILL"] = "1" # Ensures that a CUDA context is not created on import of rapids. # Allows pytorch to create the context instead -os.environ['RAPIDS_NO_INITIALIZE'] = '1' +os.environ["RAPIDS_NO_INITIALIZE"] = "1" + def init_pytorch_worker(rank, world_size, cugraph_id): import rmm - rmm.reinitialize(devices=rank, managed_memory=True, pool_allocator=True,) + + rmm.reinitialize( + devices=rank, + managed_memory=True, + pool_allocator=True, + ) import cupy + cupy.cuda.Device(rank).use() from rmm.allocators.cupy import rmm_cupy_allocator + cupy.cuda.set_allocator(rmm_cupy_allocator) from cugraph.testing.mg_utils import enable_spilling + enable_spilling() torch.cuda.set_device(rank) - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '12355' - dist.init_process_group('nccl', rank=rank, world_size=world_size) + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + dist.init_process_group("nccl", rank=rank, world_size=world_size) cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) -def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan_out, - split_idx, num_classes, wall_clock_start, tempdir=None, - num_layers=3): +def run_train( + rank, + data, + world_size, + cugraph_id, + model, + epochs, + batch_size, + fan_out, + split_idx, + num_classes, + wall_clock_start, + tempdir=None, + num_layers=3, +): init_pytorch_worker( rank, @@ -60,8 +93,7 @@ def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan model = model.to(rank) model = DistributedDataParallel(model, device_ids=[rank]) - optimizer = torch.optim.Adam(model.parameters(), lr=0.01, - weight_decay=0.0005) + optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) kwargs = dict( num_neighbors=[fan_out] * num_layers, @@ -73,16 +105,18 @@ def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan graph_store = GraphStore(is_multi_gpu=True) ixr = torch.tensor_split(data.edge_index, world_size, axis=1)[rank] - graph_store[('node','rel','node'), 'coo', False, (data.num_nodes, data.num_nodes)] = ixr + graph_store[ + ("node", "rel", "node"), "coo", False, (data.num_nodes, data.num_nodes) + ] = ixr feature_store = TensorDictFeatureStore() - feature_store['node', 'x'] = data.x - feature_store['node', 'y'] = data.y + feature_store["node", "x"] = data.x + feature_store["node", "y"] = data.y dist.barrier() - ix_train = torch.tensor_split(split_idx['train'], world_size)[rank].cuda() - train_path = os.path.join(tempdir, f'train_{rank}') + ix_train = 
torch.tensor_split(split_idx["train"], world_size)[rank].cuda() + train_path = os.path.join(tempdir, f"train_{rank}") os.mkdir(train_path) train_loader = NeighborLoader( (feature_store, graph_store), @@ -90,11 +124,11 @@ def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan directory=train_path, shuffle=True, drop_last=True, - **kwargs + **kwargs, ) - ix_test = torch.tensor_split(split_idx['test'], world_size)[rank].cuda() - test_path = os.path.join(tempdir, f'test_{rank}') + ix_test = torch.tensor_split(split_idx["test"], world_size)[rank].cuda() + test_path = os.path.join(tempdir, f"test_{rank}") os.mkdir(test_path) test_loader = NeighborLoader( (feature_store, graph_store), @@ -103,11 +137,11 @@ def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan shuffle=True, drop_last=True, local_seeds_per_call=80000, - **kwargs + **kwargs, ) - ix_valid = torch.tensor_split(split_idx['valid'], world_size)[rank].cuda() - valid_path = os.path.join(tempdir, f'valid_{rank}') + ix_valid = torch.tensor_split(split_idx["valid"], world_size)[rank].cuda() + valid_path = os.path.join(tempdir, f"valid_{rank}") os.mkdir(valid_path) valid_loader = NeighborLoader( (feature_store, graph_store), @@ -115,7 +149,7 @@ def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan directory=valid_path, shuffle=True, drop_last=True, - **kwargs + **kwargs, ) dist.barrier() @@ -124,18 +158,17 @@ def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan warmup_steps = 20 dist.barrier() torch.cuda.synchronize() - + if rank == 0: prep_time = round(time.perf_counter() - wall_clock_start, 2) - print("Total time before training begins (prep_time) =", prep_time, - "seconds") + print("Total time before training begins (prep_time) =", prep_time, "seconds") print("Beginning training...") for epoch in range(epochs): for i, batch in enumerate(train_loader): if i == warmup_steps: torch.cuda.synchronize() start = time.time() - + batch = batch.to(rank) batch_size = batch.batch_size @@ -146,13 +179,22 @@ def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan loss.backward() optimizer.step() if rank == 0 and i % 10 == 0: - print("Epoch: " + str(epoch) + ", Iteration: " + str(i) + - ", Loss: " + str(loss)) + print( + "Epoch: " + + str(epoch) + + ", Iteration: " + + str(i) + + ", Loss: " + + str(loss) + ) nb = i + 1.0 - + if rank == 0: - print("Average Training Iteration Time:", - (time.time() - start) / (nb - warmup_steps), "s/iter") + print( + "Average Training Iteration Time:", + (time.time() - start) / (nb - warmup_steps), + "s/iter", + ) with torch.no_grad(): total_correct = total_examples = 0 @@ -165,7 +207,7 @@ def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan batch.y = batch.y.to(torch.long) out = model(batch.x, batch.edge_index)[:batch_size] - + pred = out.argmax(dim=-1) y = batch.y[:batch_size].view(-1).to(torch.long) @@ -174,8 +216,10 @@ def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan acc_val = total_correct / total_examples if rank == 0: - print(f"Validation Accuracy: {acc_val * 100.0:.4f}%", ) - + print( + f"Validation Accuracy: {acc_val * 100.0:.4f}%", + ) + torch.cuda.synchronize() with torch.no_grad(): @@ -195,8 +239,10 @@ def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan acc_test = total_correct / total_examples if rank == 0: - print(f"Test Accuracy: {acc_test * 100.0:.4f}%", ) - #dist.barrier() + print( + 
f"Test Accuracy: {acc_test * 100.0:.4f}%", + ) + # dist.barrier() if rank == 0: total_time = round(time.perf_counter() - wall_clock_start, 2) @@ -205,47 +251,49 @@ def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan cugraph_comms_shutdown() -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--hidden_channels', type=int, default=256) - parser.add_argument('--num_layers', type=int, default=2) - parser.add_argument('--lr', type=float, default=0.001) - parser.add_argument('--epochs', type=int, default=4) - parser.add_argument('--batch_size', type=int, default=1024) - parser.add_argument('--fan_out', type=int, default=30) - parser.add_argument('--tempdir_root', type=str, default=None) - parser.add_argument('--dataset_root', type=str, default='dataset') - parser.add_argument('--dataset', type=str, default='ogbn-products') + parser.add_argument("--hidden_channels", type=int, default=256) + parser.add_argument("--num_layers", type=int, default=2) + parser.add_argument("--lr", type=float, default=0.001) + parser.add_argument("--epochs", type=int, default=4) + parser.add_argument("--batch_size", type=int, default=1024) + parser.add_argument("--fan_out", type=int, default=30) + parser.add_argument("--tempdir_root", type=str, default=None) + parser.add_argument("--dataset_root", type=str, default="dataset") + parser.add_argument("--dataset", type=str, default="ogbn-products") parser.add_argument( - "--n_devices", type=int, default=-1, - help="1-8 to use that many GPUs. Defaults to all available GPUs") + "--n_devices", + type=int, + default=-1, + help="1-8 to use that many GPUs. Defaults to all available GPUs", + ) args = parser.parse_args() wall_clock_start = time.perf_counter() from rmm.allocators.torch import rmm_torch_allocator - torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - dataset = PygNodePropPredDataset(name=args.dataset, - root=args.dataset_root) + dataset = PygNodePropPredDataset(name=args.dataset, root=args.dataset_root) split_idx = dataset.get_idx_split() data = dataset[0] data.y = data.y.reshape(-1) - model = torch_geometric.nn.models.GCN(dataset.num_features, - args.hidden_channels, - args.num_layers, - dataset.num_classes) + model = torch_geometric.nn.models.GCN( + dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes + ) print("Data =", data) if args.n_devices == -1: world_size = torch.cuda.device_count() else: world_size = args.n_devices - print('Let\'s use', world_size, 'GPUs!') + print("Let's use", world_size, "GPUs!") # Create the uid needed for cuGraph comms cugraph_id = cugraph_comms_create_unique_id() @@ -253,7 +301,20 @@ def run_train(rank, data, world_size, cugraph_id, model, epochs, batch_size, fan with tempfile.TemporaryDirectory(dir=args.tempdir_root) as tempdir: mp.spawn( run_train, - args=(data, world_size, cugraph_id, model, args.epochs, args.batch_size, - args.fan_out, split_idx, dataset.num_classes, - wall_clock_start, tempdir, args.num_layers), - nprocs=world_size, join=True) \ No newline at end of file + args=( + data, + world_size, + cugraph_id, + model, + args.epochs, + args.batch_size, + args.fan_out, + split_idx, + dataset.num_classes, + wall_clock_start, + tempdir, + args.num_layers, + ), + nprocs=world_size, + join=True, + ) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py index 
550b9a600a7..71b0e4bb2fb 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py @@ -1,3 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import time import argparse import tempfile @@ -19,90 +32,93 @@ cupy.cuda.set_allocator(rmm_cupy_allocator) torch.cuda.memory.change_current_allocator(rmm_torch_allocator) -import torch.nn.functional as F -import torch_geometric -import cugraph_pyg -from cugraph_pyg.loader import NeighborLoader +import torch.nn.functional as F # noqa: E402 +import torch_geometric # noqa: E402 +import cugraph_pyg # noqa: E402 +from cugraph_pyg.loader import NeighborLoader # noqa: E402 # Enable cudf spilling to save gpu memory -from cugraph.testing.mg_utils import enable_spilling +from cugraph.testing.mg_utils import enable_spilling # noqa: E402 + enable_spilling() parser = argparse.ArgumentParser() -parser.add_argument('--hidden_channels', type=int, default=256) -parser.add_argument('--num_layers', type=int, default=2) -parser.add_argument('--lr', type=float, default=0.001) -parser.add_argument('--epochs', type=int, default=4) -parser.add_argument('--batch_size', type=int, default=1024) -parser.add_argument('--fan_out', type=int, default=30) -parser.add_argument('--tempdir_root', type=str, default=None) -parser.add_argument('--dataset_root', type=str, default='dataset') -parser.add_argument('--dataset', type=str, default='ogbn-products') +parser.add_argument("--hidden_channels", type=int, default=256) +parser.add_argument("--num_layers", type=int, default=2) +parser.add_argument("--lr", type=float, default=0.001) +parser.add_argument("--epochs", type=int, default=4) +parser.add_argument("--batch_size", type=int, default=1024) +parser.add_argument("--fan_out", type=int, default=30) +parser.add_argument("--tempdir_root", type=str, default=None) +parser.add_argument("--dataset_root", type=str, default="dataset") +parser.add_argument("--dataset", type=str, default="ogbn-products") args = parser.parse_args() wall_clock_start = time.perf_counter() -device = torch.device('cuda') +device = torch.device("cuda") -from ogb.nodeproppred import PygNodePropPredDataset -dataset = PygNodePropPredDataset(name=args.dataset, - root=args.dataset_root) +from ogb.nodeproppred import PygNodePropPredDataset # noqa: E402 + +dataset = PygNodePropPredDataset(name=args.dataset, root=args.dataset_root) split_idx = dataset.get_idx_split() data = dataset[0] graph_store = cugraph_pyg.data.GraphStore() -graph_store[('node','rel','node'), 'coo', False, (data.num_nodes, data.num_nodes)] = data.edge_index +graph_store[ + ("node", "rel", "node"), "coo", False, (data.num_nodes, data.num_nodes) +] = data.edge_index feature_store = cugraph_pyg.data.TensorDictFeatureStore() -feature_store['node', 'x'] = data.x -feature_store['node', 'y'] = data.y +feature_store["node", "x"] = data.x +feature_store["node", "y"] = data.y with tempfile.TemporaryDirectory(dir=args.tempdir_root) as samples_dir: - 
train_dir = os.path.join(samples_dir, 'train') + train_dir = os.path.join(samples_dir, "train") os.mkdir(train_dir) train_loader = NeighborLoader( data=(feature_store, graph_store), - num_neighbors=[args.fan_out]*args.num_layers, - input_nodes=split_idx['train'], + num_neighbors=[args.fan_out] * args.num_layers, + input_nodes=split_idx["train"], replace=False, batch_size=args.batch_size, directory=train_dir, ) - val_dir = os.path.join(samples_dir, 'val') + val_dir = os.path.join(samples_dir, "val") os.mkdir(val_dir) val_loader = NeighborLoader( data=(feature_store, graph_store), - num_neighbors=[args.fan_out]*args.num_layers, - input_nodes=split_idx['valid'], + num_neighbors=[args.fan_out] * args.num_layers, + input_nodes=split_idx["valid"], replace=False, batch_size=args.batch_size, directory=val_dir, ) - test_dir = os.path.join(samples_dir, 'test') + test_dir = os.path.join(samples_dir, "test") os.mkdir(test_dir) test_loader = NeighborLoader( data=(feature_store, graph_store), - num_neighbors=[args.fan_out]*args.num_layers, - input_nodes=split_idx['test'], + num_neighbors=[args.fan_out] * args.num_layers, + input_nodes=split_idx["test"], replace=False, batch_size=args.batch_size, directory=test_dir, ) model = torch_geometric.nn.models.GCN( - dataset.num_features, - args.hidden_channels, - args.num_layers, - dataset.num_classes, - ).to(device) + dataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes, + ).to(device) - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, - weight_decay=0.0005) + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) warmup_steps = 20 - def train(epoch:int): + + def train(epoch: int): model.train() for i, batch in enumerate(train_loader): if i == warmup_steps: @@ -120,10 +136,12 @@ def train(epoch:int): optimizer.step() if i % 10 == 0: - print(f'Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}') + print(f"Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}") torch.cuda.synchronize() - print(f'Average Training Iteration Time (s/iter): \ - {(time.perf_counter() - start_avg_time)/(i-warmup_steps):.6f}') + print( + f"Average Training Iteration Time (s/iter): \ + {(time.perf_counter() - start_avg_time)/(i-warmup_steps):.6f}" + ) @torch.no_grad() def test(loader: NeighborLoader, val_steps: Optional[int] = None): @@ -151,10 +169,10 @@ def test(loader: NeighborLoader, val_steps: Optional[int] = None): for epoch in range(1, 1 + args.epochs): train(epoch) val_acc = test(val_loader, val_steps=100) - print(f'Val Acc: ~{val_acc:.4f}') + print(f"Val Acc: ~{val_acc:.4f}") test_acc = test(test_loader) - print(f'Test Acc: {test_acc:.4f}') + print(f"Test Acc: {test_acc:.4f}") total_time = round(time.perf_counter() - wall_clock_start, 2) print("Total Program Runtime (total_time) =", total_time, "seconds") - print("total_time - prep_time =", total_time - prep_time, "seconds") \ No newline at end of file + print("total_time - prep_time =", total_time - prep_time, "seconds") diff --git a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py index c17aa97b49a..c8ab46f4205 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -20,8 +20,7 @@ from cugraph_pyg.loader.dask_node_loader import BulkSampleLoader + def CuGraphNeighborLoader(*args, **kwargs): - warnings.warn( - "CuGraphNeighborLoader has been renamed to DaskNeighborLoader" - ) - return DaskNeighborLoader(*args, **kwargs) \ No newline at end of file + warnings.warn("CuGraphNeighborLoader has been renamed to DaskNeighborLoader") + return DaskNeighborLoader(*args, **kwargs) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py index 505a310114b..3d29ee3aca3 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py @@ -23,7 +23,8 @@ from cugraph.gnn import UniformNeighborSampler, DistSampleWriter from cugraph.utilities.utils import import_optional -torch_geometric = import_optional('torch_geometric') +torch_geometric = import_optional("torch_geometric") + class NeighborLoader(NodeLoader): """ @@ -32,184 +33,200 @@ class NeighborLoader(NodeLoader): Duck-typed version of torch_geometric.loader.NeighborLoader """ - def __init__(self, - data: Union['torch_geometric.data.Data', 'torch_geometric.data.HeteroData', Tuple['torch_geometric.data.FeatureStore', 'torch_geometric.data.GraphStore']], - num_neighbors: Union[List[int], Dict['torch_geometric.typing.EdgeType', List[int]]], - input_nodes: 'torch_geometric.typing.InputNodes' = None, - input_time: 'torch_geometric.typing.OptTensor' = None, + + def __init__( + self, + data: Union[ + "torch_geometric.data.Data", + "torch_geometric.data.HeteroData", + Tuple[ + "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore" + ], + ], + num_neighbors: Union[ + List[int], Dict["torch_geometric.typing.EdgeType", List[int]] + ], + input_nodes: "torch_geometric.typing.InputNodes" = None, + input_time: "torch_geometric.typing.OptTensor" = None, replace: bool = False, - subgraph_type: Union['torch_geometric.typing.SubgraphType', str] = 'directional', + subgraph_type: Union[ + "torch_geometric.typing.SubgraphType", str + ] = "directional", disjoint: bool = False, - temporal_strategy: str = 'uniform', + temporal_strategy: str = "uniform", time_attr: Optional[str] = None, weight_attr: Optional[str] = None, transform: Optional[Callable] = None, transform_sampler_output: Optional[Callable] = None, is_sorted: bool = False, filter_per_worker: Optional[bool] = None, - neighbor_sampler: Optional['torch_geometric.sampler.NeighborSampler'] = None, + neighbor_sampler: Optional["torch_geometric.sampler.NeighborSampler"] = None, directed: bool = True, # Deprecated. - batch_size: int =16, - directory:str=None, + batch_size: int = 16, + directory: str = None, batches_per_partition=256, - format:str='parquet', - compression:Optional[str]=None, - local_seeds_per_call: Optional[int]=None, - **kwargs,): - """ - data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] - See torch_geometric.loader.NeighborLoader. - num_neighbors: List[int] or Dict[EdgeType, List[int]] - Fanout values. - See torch_geometric.loader.NeighborLoader. - input_nodes: InputNodes - Input nodes for sampling. - See torch_geometric.loader.NeighborLoader. - input_time: OptTensor (optional) - See torch_geometric.loader.NeighborLoader. - replace: bool (optional, default=False) - Whether to sample with replacement. - See torch_geometric.loader.NeighborLoader. - subgraph_type: Union[SubgraphType, str] (optional, default='directional') - The type of subgraph to return. 
- Currently only 'directional' is supported. - See torch_geometric.loader.NeighborLoader. - disjoint: bool (optional, default=False) - Whether to perform disjoint sampling. - Currently unsupported. - See torch_geometric.loader.NeighborLoader. - temporal_strategy: str (optional, default='uniform') - Currently only 'uniform' is suppported. - See torch_geometric.loader.NeighborLoader. - time_attr: str (optional, default=None) - Used for temporal sampling. - See torch_geometric.loader.NeighborLoader. - weight_attr: str (optional, default=None) - Used for biased sampling. - See torch_geometric.loader.NeighborLoader. - transform: Callable (optional, default=None) - See torch_geometric.loader.NeighborLoader. - transform_sampler_output: Callable (optional, default=None) - See torch_geometric.loader.NeighborLoader. - is_sorted: bool (optional, default=False) - Ignored by cuGraph. - See torch_geometric.loader.NeighborLoader. - filter_per_worker: bool (optional, default=False) - Currently ignored by cuGraph, but this may - change once in-memory sampling is implemented. - See torch_geometric.loader.NeighborLoader. - neighbor_sampler: torch_geometric.sampler.NeighborSampler (optional, default=None) - Not supported by cuGraph. - See torch_geometric.loader.NeighborLoader. - directed: bool (optional, default=True) - Deprecated. - See torch_geometric.loader.NeighborLoader. - batch_size: int (optional, default=16) - The number of input nodes per output minibatch. - See torch.utils.dataloader. - directory: str (optional, default=None) - The directory where samples will be temporarily stored. - It is recommend that this be set by the user, usually - setting it to a tempfile.TemporaryDirectory with a context - manager is a good option but depending on the filesystem, - you may want to choose an alternative location with fast I/O - intead. - If not set, this will create a TemporaryDirectory that will - persist until this object is garbage collected. - See cugraph.gnn.DistSampleWriter. - batches_per_partition: int (optional, default=256) - The number of batches per partition if writing samples to - disk. Manually tuning this parameter is not recommended - but reducing it may help conserve GPU memory. - See cugraph.gnn.DistSampleWriter. - format: str (optional, default='parquet') - If writing samples to disk, they will be written in this - file format. - See cugraph.gnn.DistSampleWriter. - compression: str (optional, default=None) - The compression type to use if writing samples to disk. - If not provided, it is automatically chosen. - local_seeds_per_call: int (optional, default=None) - The number of seeds to process within a single sampling call. - Manually tuning this parameter is not recommended but reducing - it may conserve GPU memory. The total number of seeds processed - per sampling call is equal to the sum of this parameter across - all workers. If not provided, it will be automatically - calculated. - See cugraph.gnn.DistSampler. - **kwargs - Other keyword arguments passed to the superclass. - """ - - subgraph_type = torch_geometric.sampler.base.SubgraphType(subgraph_type) - - if not directed: - subgraph_type = torch_geometric.sampler.base.SubgraphType.induced - warnings.warn( - "The 'directed' argument is deprecated. " - "Use subgraph_type='induced' instead." 
- ) - if subgraph_type != torch_geometric.sampler.base.SubgraphType.directional: - raise ValueError("Only directional subgraphs are currently supported") - if disjoint: - raise ValueError("Disjoint sampling is currently unsupported") - if temporal_strategy != 'uniform': - warnings.warn('Only the uniform temporal strategy is currently supported') - if neighbor_sampler is not None: - raise ValueError("Passing a neighbor sampler is currently unsupported") - if time_attr is not None: - raise ValueError("Temporal sampling is currently unsupported") - if weight_attr is not None: - raise ValueError("Biased sampling is currently unsupported") - if is_sorted: - warnings.warn("The 'is_sorted' argument is ignored by cuGraph.") - if not isinstance(data, (list, tuple)) or not isinstance(data[1], cugraph_pyg.data.GraphStore): - # Will eventually automatically convert these objects to cuGraph objects. - raise NotImplementedError("Currently can't accept non-cugraph graphs") - - if directory is None: - warnings.warn("Setting a directory to store samples is recommended.") - self._tempdir = tempfile.TemporaryDirectory() - directory = self._tempdir.name - - if compression is None: - compression = "CSR" - elif compression not in ["CSR", "COO"]: - raise ValueError("Invalid value for compression (expected 'CSR' or 'COO')") - - writer = DistSampleWriter( - directory=directory, - batches_per_partition=batches_per_partition, - format=format - ) + format: str = "parquet", + compression: Optional[str] = None, + local_seeds_per_call: Optional[int] = None, + **kwargs, + ): + """ + data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] + See torch_geometric.loader.NeighborLoader. + num_neighbors: List[int] or Dict[EdgeType, List[int]] + Fanout values. + See torch_geometric.loader.NeighborLoader. + input_nodes: InputNodes + Input nodes for sampling. + See torch_geometric.loader.NeighborLoader. + input_time: OptTensor (optional) + See torch_geometric.loader.NeighborLoader. + replace: bool (optional, default=False) + Whether to sample with replacement. + See torch_geometric.loader.NeighborLoader. + subgraph_type: Union[SubgraphType, str] (optional, default='directional') + The type of subgraph to return. + Currently only 'directional' is supported. + See torch_geometric.loader.NeighborLoader. + disjoint: bool (optional, default=False) + Whether to perform disjoint sampling. + Currently unsupported. + See torch_geometric.loader.NeighborLoader. + temporal_strategy: str (optional, default='uniform') + Currently only 'uniform' is suppported. + See torch_geometric.loader.NeighborLoader. + time_attr: str (optional, default=None) + Used for temporal sampling. + See torch_geometric.loader.NeighborLoader. + weight_attr: str (optional, default=None) + Used for biased sampling. + See torch_geometric.loader.NeighborLoader. + transform: Callable (optional, default=None) + See torch_geometric.loader.NeighborLoader. + transform_sampler_output: Callable (optional, default=None) + See torch_geometric.loader.NeighborLoader. + is_sorted: bool (optional, default=False) + Ignored by cuGraph. + See torch_geometric.loader.NeighborLoader. + filter_per_worker: bool (optional, default=False) + Currently ignored by cuGraph, but this may + change once in-memory sampling is implemented. + See torch_geometric.loader.NeighborLoader. + neighbor_sampler: torch_geometric.sampler.NeighborSampler + (optional, default=None) + Not supported by cuGraph. + See torch_geometric.loader.NeighborLoader. + directed: bool (optional, default=True) + Deprecated. 
+ See torch_geometric.loader.NeighborLoader. + batch_size: int (optional, default=16) + The number of input nodes per output minibatch. + See torch.utils.dataloader. + directory: str (optional, default=None) + The directory where samples will be temporarily stored. + It is recommend that this be set by the user, usually + setting it to a tempfile.TemporaryDirectory with a context + manager is a good option but depending on the filesystem, + you may want to choose an alternative location with fast I/O + intead. + If not set, this will create a TemporaryDirectory that will + persist until this object is garbage collected. + See cugraph.gnn.DistSampleWriter. + batches_per_partition: int (optional, default=256) + The number of batches per partition if writing samples to + disk. Manually tuning this parameter is not recommended + but reducing it may help conserve GPU memory. + See cugraph.gnn.DistSampleWriter. + format: str (optional, default='parquet') + If writing samples to disk, they will be written in this + file format. + See cugraph.gnn.DistSampleWriter. + compression: str (optional, default=None) + The compression type to use if writing samples to disk. + If not provided, it is automatically chosen. + local_seeds_per_call: int (optional, default=None) + The number of seeds to process within a single sampling call. + Manually tuning this parameter is not recommended but reducing + it may conserve GPU memory. The total number of seeds processed + per sampling call is equal to the sum of this parameter across + all workers. If not provided, it will be automatically + calculated. + See cugraph.gnn.DistSampler. + **kwargs + Other keyword arguments passed to the superclass. + """ - feature_store, graph_store = data - sampler = BaseSampler( - UniformNeighborSampler( - graph_store._graph, - writer, - retain_original_seeds=True, - fanout=num_neighbors, - prior_sources_behavior='exclude', - deduplicate_sources=True, - compression=compression, - compress_per_hop=False, - with_replacement=replace, - local_seeds_per_call=local_seeds_per_call, - ), - (feature_store, graph_store), - batch_size=batch_size - ) - # TODO add heterogeneous support and pass graph_store._vertex_offsets - - super().__init__( - (feature_store, graph_store), - sampler, - input_nodes=input_nodes, - input_time=input_time, - transform=transform, - transform_sampler_output=transform_sampler_output, - filter_per_worker=filter_per_worker, - batch_size=batch_size, - **kwargs, + subgraph_type = torch_geometric.sampler.base.SubgraphType(subgraph_type) + + if not directed: + subgraph_type = torch_geometric.sampler.base.SubgraphType.induced + warnings.warn( + "The 'directed' argument is deprecated. " + "Use subgraph_type='induced' instead." 
) + if subgraph_type != torch_geometric.sampler.base.SubgraphType.directional: + raise ValueError("Only directional subgraphs are currently supported") + if disjoint: + raise ValueError("Disjoint sampling is currently unsupported") + if temporal_strategy != "uniform": + warnings.warn("Only the uniform temporal strategy is currently supported") + if neighbor_sampler is not None: + raise ValueError("Passing a neighbor sampler is currently unsupported") + if time_attr is not None: + raise ValueError("Temporal sampling is currently unsupported") + if weight_attr is not None: + raise ValueError("Biased sampling is currently unsupported") + if is_sorted: + warnings.warn("The 'is_sorted' argument is ignored by cuGraph.") + if not isinstance(data, (list, tuple)) or not isinstance( + data[1], cugraph_pyg.data.GraphStore + ): + # Will eventually automatically convert these objects to cuGraph objects. + raise NotImplementedError("Currently can't accept non-cugraph graphs") + + if directory is None: + warnings.warn("Setting a directory to store samples is recommended.") + self._tempdir = tempfile.TemporaryDirectory() + directory = self._tempdir.name + + if compression is None: + compression = "CSR" + elif compression not in ["CSR", "COO"]: + raise ValueError("Invalid value for compression (expected 'CSR' or 'COO')") + + writer = DistSampleWriter( + directory=directory, + batches_per_partition=batches_per_partition, + format=format, + ) + + feature_store, graph_store = data + sampler = BaseSampler( + UniformNeighborSampler( + graph_store._graph, + writer, + retain_original_seeds=True, + fanout=num_neighbors, + prior_sources_behavior="exclude", + deduplicate_sources=True, + compression=compression, + compress_per_hop=False, + with_replacement=replace, + local_seeds_per_call=local_seeds_per_call, + ), + (feature_store, graph_store), + batch_size=batch_size, + ) + # TODO add heterogeneous support and pass graph_store._vertex_offsets + + super().__init__( + (feature_store, graph_store), + sampler, + input_nodes=input_nodes, + input_time=input_time, + transform=transform, + transform_sampler_output=transform_sampler_output, + filter_per_worker=filter_per_worker, + batch_size=batch_size, + **kwargs, + ) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py index 68cbae43467..56b58352a7c 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py @@ -18,114 +18,131 @@ from cugraph.utilities.utils import import_optional -torch_geometric = import_optional('torch_geometric') -torch = import_optional('torch') +torch_geometric = import_optional("torch_geometric") +torch = import_optional("torch") + class NodeLoader: """ Duck-typed version of torch_geometric.loader.NodeLoader """ - def __init__(self, - data: Union['torch_geometric.data.Data', 'torch_geometric.data.HeteroData', Tuple['torch_geometric.data.FeatureStore', 'torch_geometric.data.GraphStore']], - node_sampler: 'cugraph_pyg.sampler.BaseSampler', - input_nodes: 'torch_geometric.typing.InputNodes' = None, - input_time: 'torch_geometric.typing.OptTensor' = None, + def __init__( + self, + data: Union[ + "torch_geometric.data.Data", + "torch_geometric.data.HeteroData", + Tuple[ + "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore" + ], + ], + node_sampler: "cugraph_pyg.sampler.BaseSampler", + input_nodes: "torch_geometric.typing.InputNodes" = None, + input_time: "torch_geometric.typing.OptTensor" = None, 
transform: Optional[Callable] = None, transform_sampler_output: Optional[Callable] = None, filter_per_worker: Optional[bool] = None, - custom_cls: Optional['torch_geometric.data.HeteroData'] = None, - input_id: 'torch_geometric.typing.OptTensor' = None, + custom_cls: Optional["torch_geometric.data.HeteroData"] = None, + input_id: "torch_geometric.typing.OptTensor" = None, batch_size: int = 1, shuffle: bool = False, - drop_last: bool = False, - **kwargs,): - """ - Parameters - ---------- - data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] - See torch_geometric.loader.NodeLoader. - node_sampler: BaseSampler - See torch_geometric.loader.NodeLoader. - input_nodes: InputNodes - See torch_geometric.loader.NodeLoader. - input_time: OptTensor - See torch_geometric.loader.NodeLoader. - transform: Callable (optional, default=None) - This argument currently has no effect. - transform_sampler_output: Callable (optional, default=None) - This argument currently has no effect. - filter_per_worker: bool (optional, default=False) - This argument currently has no effect. - custom_cls: HeteroData - This argument currently has no effect. This loader will - always return a Data or HeteroData object. - input_id: OptTensor - See torch_geometric.loader.NodeLoader. - - """ - if not isinstance(data, (list, tuple)) or not isinstance(data[1], cugraph_pyg.data.GraphStore): - # Will eventually automatically convert these objects to cuGraph objects. - raise NotImplementedError("Currently can't accept non-cugraph graphs") - - if not isinstance(node_sampler, cugraph_pyg.sampler.BaseSampler): - raise NotImplementedError("Must provide a cuGraph sampler") - - if input_time is not None: - raise ValueError("Temporal sampling is currently unsupported") - - if filter_per_worker: - warnings.warn("filter_per_worker is currently ignored") - - if custom_cls is not None: - warnings.warn("custom_cls is currently ignored") - - if transform is not None: - warnings.warn("transform is currently ignored.") - - if transform_sampler_output is not None: - warnings.warn("transform_sampler_output is currently ignored.") - - input_type, input_nodes, input_id = torch_geometric.loader.utils.get_input_nodes( - data, - input_nodes, - input_id, - ) - - self.__input_data = torch_geometric.loader.node_loader.NodeSamplerInput( - input_id=input_id, - node=input_nodes, - time=None, - input_type=input_type, - ) - - self.__data = data - - self.__node_sampler = node_sampler - - self.__batch_size = batch_size - self.__shuffle = shuffle - self.__drop_last = drop_last - - + drop_last: bool = False, + **kwargs, + ): + """ + Parameters + ---------- + data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] + See torch_geometric.loader.NodeLoader. + node_sampler: BaseSampler + See torch_geometric.loader.NodeLoader. + input_nodes: InputNodes + See torch_geometric.loader.NodeLoader. + input_time: OptTensor + See torch_geometric.loader.NodeLoader. + transform: Callable (optional, default=None) + This argument currently has no effect. + transform_sampler_output: Callable (optional, default=None) + This argument currently has no effect. + filter_per_worker: bool (optional, default=False) + This argument currently has no effect. + custom_cls: HeteroData + This argument currently has no effect. This loader will + always return a Data or HeteroData object. + input_id: OptTensor + See torch_geometric.loader.NodeLoader. 
+ + """ + if not isinstance(data, (list, tuple)) or not isinstance( + data[1], cugraph_pyg.data.GraphStore + ): + # Will eventually automatically convert these objects to cuGraph objects. + raise NotImplementedError("Currently can't accept non-cugraph graphs") + + if not isinstance(node_sampler, cugraph_pyg.sampler.BaseSampler): + raise NotImplementedError("Must provide a cuGraph sampler") + + if input_time is not None: + raise ValueError("Temporal sampling is currently unsupported") + + if filter_per_worker: + warnings.warn("filter_per_worker is currently ignored") + + if custom_cls is not None: + warnings.warn("custom_cls is currently ignored") + + if transform is not None: + warnings.warn("transform is currently ignored.") + + if transform_sampler_output is not None: + warnings.warn("transform_sampler_output is currently ignored.") + + ( + input_type, + input_nodes, + input_id, + ) = torch_geometric.loader.utils.get_input_nodes( + data, + input_nodes, + input_id, + ) + + self.__input_data = torch_geometric.loader.node_loader.NodeSamplerInput( + input_id=input_id, + node=input_nodes, + time=None, + input_type=input_type, + ) + + self.__data = data + + self.__node_sampler = node_sampler + + self.__batch_size = batch_size + self.__shuffle = shuffle + self.__drop_last = drop_last + def __iter__(self): if self.__shuffle: perm = torch.randperm(self.__input_data.node.numel()) else: perm = torch.arange(self.__input_data.node.numel()) - + if self.__drop_last: d = perm.numel() % self.__batch_size perm = perm[:-d] - + input_data = torch_geometric.loader.node_loader.NodeSamplerInput( - input_id=None if self.__input_data.input_id is None else self.__input_data.input_id[perm], + input_id=None + if self.__input_data.input_id is None + else self.__input_data.input_id[perm], node=self.__input_data.node[perm], - time=None if self.__input_data.time is None else self.__input_data.time[perm], + time=None + if self.__input_data.time is None + else self.__input_data.time[perm], input_type=self.__input_data.input_type, ) - + return cugraph_pyg.sampler.SampleIterator( - self.__data, - self.__node_sampler.sample_from_nodes(input_data) - ) \ No newline at end of file + self.__data, self.__node_sampler.sample_from_nodes(input_data) + ) diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py b/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py index 13322c72e83..34fe9c4463e 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py @@ -11,4 +11,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
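With the stores in place, the reworked loader is driven the same way as in the new gcn_dist_sg.py example: construct NeighborLoader directly from the (feature_store, graph_store) pair and iterate PyG Data batches. A rough usage sketch, assuming the stores populated above; the fanout, seed tensor, and batch size are made-up values for illustration:

import tempfile

import torch

from cugraph_pyg.loader import NeighborLoader

with tempfile.TemporaryDirectory() as samples_dir:
    loader = NeighborLoader(
        data=(feature_store, graph_store),  # stores populated as sketched above
        num_neighbors=[10, 10],             # fanout per hop
        input_nodes=torch.arange(1000),     # hypothetical seed nodes
        batch_size=512,
        directory=samples_dir,              # sampled batches are written here
    )
    for batch in loader:
        # Each batch is a filtered torch_geometric Data object; the new
        # SampleIterator also attaches n_id and e_id for the sampled subgraph.
        x, edge_index = batch.x, batch.edge_index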
-from cugraph_pyg.sampler.sampler import BaseSampler, SampleIterator \ No newline at end of file +from cugraph_pyg.sampler.sampler import BaseSampler, SampleIterator diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py index 8422a38563c..101f7b042be 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py @@ -17,10 +17,22 @@ from cugraph.gnn import DistSampler, DistSampleReader torch = import_optional("torch") -torch_geometric = import_optional('torch_geometric') +torch_geometric = import_optional("torch_geometric") + class SampleIterator: - def __init__(self, data: Tuple['torch_geometric.data.FeatureStore', 'torch_geometric.data.GraphStore'], output_iter:Iterator[Union['torch_geometric.sampler.HeteroSamplerOutput', 'torch_geometric.sampler.SamplerOutput']]): + def __init__( + self, + data: Tuple[ + "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore" + ], + output_iter: Iterator[ + Union[ + "torch_geometric.sampler.HeteroSamplerOutput", + "torch_geometric.sampler.SamplerOutput", + ] + ], + ): self.__feature_store, self.__graph_store = data self.__output_iter = output_iter @@ -31,8 +43,10 @@ def __next__(self): if sz == next_sample.col.numel(): col = next_sample.col else: - col = torch_geometric.edge_index.ptr2index(next_sample.col, next_sample.edge.numel()) - + col = torch_geometric.edge_index.ptr2index( + next_sample.col, next_sample.edge.numel() + ) + data = torch_geometric.loader.utils.filter_custom_store( self.__feature_store, self.__graph_store, @@ -43,9 +57,9 @@ def __next__(self): None, ) - if 'n_id' not in data: + if "n_id" not in data: data.n_id = next_sample.node - if next_sample.edge is not None and 'e_id' not in data: + if next_sample.edge is not None and "e_id" not in data: edge = next_sample.edge.to(torch.long) data.e_id = edge @@ -65,7 +79,7 @@ def __next__(self): col[edge_type] = col_idx else: col[edge_type] = torch_geometric.edge_index.ptr2index(col_idx, sz) - + data = torch_geometric.loader.utils.filter_custom_hetero_store( self.__feature_store, self.__graph_store, @@ -77,27 +91,26 @@ def __next__(self): ) for key, node in next_sample.node.items(): - if 'n_id' not in data[key]: + if "n_id" not in data[key]: data[key].n_id = node for key, edge in (next_sample.edge or {}).items(): - if edge is not None and 'e_id' not in data[key]: + if edge is not None and "e_id" not in data[key]: edge = edge.to(torch.long) data[key].e_id = edge - data.set_value_dict('batch', next_sample.batch) - data.set_value_dict('num_sampled_nodes', next_sample.num_sampled_nodes) - data.set_value_dict('num_sampled_edges', next_sample.num_sampled_edges) + data.set_value_dict("batch", next_sample.batch) + data.set_value_dict("num_sampled_nodes", next_sample.num_sampled_nodes) + data.set_value_dict("num_sampled_edges", next_sample.num_sampled_edges) # TODO figure out how to set input_id for heterogeneous output else: raise ValueError("Invalid output type") - + return data def __iter__(self): return self - class SampleReader: @@ -105,20 +118,28 @@ def __init__(self, base_reader: DistSampleReader): self.__base_reader = base_reader self.__num_samples_remaining = 0 self.__index = 0 - + def __next__(self): if self.__num_samples_remaining == 0: # raw_sample_data is already a dict of tensors - self.__raw_sample_data, start_inclusive, end_inclusive = next(self.__base_reader) + self.__raw_sample_data, start_inclusive, end_inclusive = next( + self.__base_reader + ) - 
self.__raw_sample_data['label_hop_offsets'] -= self.__raw_sample_data['label_hop_offsets'][0].clone() - self.__raw_sample_data['renumber_map_offsets'] -= self.__raw_sample_data['renumber_map_offsets'][0].clone() - if 'major_offsets' in self.__raw_sample_data: - self.__raw_sample_data['major_offsets'] -= self.__raw_sample_data['major_offsets'][0].clone() + self.__raw_sample_data["label_hop_offsets"] -= self.__raw_sample_data[ + "label_hop_offsets" + ][0].clone() + self.__raw_sample_data["renumber_map_offsets"] -= self.__raw_sample_data[ + "renumber_map_offsets" + ][0].clone() + if "major_offsets" in self.__raw_sample_data: + self.__raw_sample_data["major_offsets"] -= self.__raw_sample_data[ + "major_offsets" + ][0].clone() self.__num_samples_remaining = end_inclusive - start_inclusive + 1 self.__index = 0 - + out = self._decode(self.__raw_sample_data, self.__index) self.__index += 1 self.__num_samples_remaining -= 1 @@ -127,43 +148,58 @@ def __next__(self): def __iter__(self): return self + class HomogeneousSampleReader(SampleReader): def __init__(self, base_reader: DistSampleReader): super().__init__(base_reader) - def __decode_csc(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): - fanout_length = (raw_sample_data['label_hop_offsets'].numel() - 1) // (raw_sample_data['renumber_map_offsets'].numel() - 1) - - major_offsets_start_incl = raw_sample_data['label_hop_offsets'][index * fanout_length] - major_offsets_end_incl = raw_sample_data['label_hop_offsets'][(index + 1) * fanout_length] + def __decode_csc(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int): + fanout_length = (raw_sample_data["label_hop_offsets"].numel() - 1) // ( + raw_sample_data["renumber_map_offsets"].numel() - 1 + ) - major_offsets = raw_sample_data['major_offsets'][major_offsets_start_incl : major_offsets_end_incl + 1].clone() - minors = raw_sample_data['minors'][major_offsets[0] : major_offsets[-1]] - edge_id = raw_sample_data['edge_id'][major_offsets[0] : major_offsets[-1]] + major_offsets_start_incl = raw_sample_data["label_hop_offsets"][ + index * fanout_length + ] + major_offsets_end_incl = raw_sample_data["label_hop_offsets"][ + (index + 1) * fanout_length + ] + + major_offsets = raw_sample_data["major_offsets"][ + major_offsets_start_incl : major_offsets_end_incl + 1 + ].clone() + minors = raw_sample_data["minors"][major_offsets[0] : major_offsets[-1]] + edge_id = raw_sample_data["edge_id"][major_offsets[0] : major_offsets[-1]] # don't retrieve edge type for a homogeneous graph major_offsets -= major_offsets[0].clone() - renumber_map_start = raw_sample_data['renumber_map_offsets'][index] - renumber_map_end = raw_sample_data['renumber_map_offsets'][index + 1] + renumber_map_start = raw_sample_data["renumber_map_offsets"][index] + renumber_map_end = raw_sample_data["renumber_map_offsets"][index + 1] - renumber_map = raw_sample_data['map'][renumber_map_start:renumber_map_end] + renumber_map = raw_sample_data["map"][renumber_map_start:renumber_map_end] - current_label_hop_offsets = raw_sample_data['label_hop_offsets'][index * fanout_length : (index + 1) * fanout_length + 1].clone() + current_label_hop_offsets = raw_sample_data["label_hop_offsets"][ + index * fanout_length : (index + 1) * fanout_length + 1 + ].clone() current_label_hop_offsets -= current_label_hop_offsets[0].clone() num_sampled_edges = major_offsets[current_label_hop_offsets].diff() - - num_sampled_nodes_hops = torch.tensor([ - minors[:num_sampled_edges[:i].sum()].max() + 1 - for i in range(1, fanout_length + 1) - ], 
device='cpu') - - num_seeds = torch.searchsorted(major_offsets, num_sampled_edges[0]).reshape((1,)).cpu() - num_sampled_nodes = torch.concat([ - num_seeds, - num_sampled_nodes_hops.diff(prepend=num_seeds) - ]) + + num_sampled_nodes_hops = torch.tensor( + [ + minors[: num_sampled_edges[:i].sum()].max() + 1 + for i in range(1, fanout_length + 1) + ], + device="cpu", + ) + + num_seeds = ( + torch.searchsorted(major_offsets, num_sampled_edges[0]).reshape((1,)).cpu() + ) + num_sampled_nodes = torch.concat( + [num_seeds, num_sampled_nodes_hops.diff(prepend=num_seeds)] + ) return torch_geometric.sampler.SamplerOutput( node=renumber_map.cpu(), @@ -175,38 +211,48 @@ def __decode_csc(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): num_sampled_edges=num_sampled_edges.cpu(), ) - def __decode_coo(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): - fanout_length = (raw_sample_data['label_hop_offsets'].numel() - 1) // (raw_sample_data['renumber_map_offsets'].numel() - 1) - - major_minor_start = raw_sample_data['label_hop_offsets'][index * fanout_length] + def __decode_coo(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int): + fanout_length = (raw_sample_data["label_hop_offsets"].numel() - 1) // ( + raw_sample_data["renumber_map_offsets"].numel() - 1 + ) + + major_minor_start = raw_sample_data["label_hop_offsets"][index * fanout_length] ix_end = (index + 1) * fanout_length - if ix_end == raw_sample_data['label_hop_offsets'].numel(): - major_minor_end = raw_sample_data['majors'].numel() + if ix_end == raw_sample_data["label_hop_offsets"].numel(): + major_minor_end = raw_sample_data["majors"].numel() else: - major_minor_end = raw_sample_data['label_hop_offsets'][ix_end] + major_minor_end = raw_sample_data["label_hop_offsets"][ix_end] - majors = raw_sample_data['majors'][major_minor_start:major_minor_end] - minors = raw_sample_data['minors'][major_minor_start:major_minor_end] - edge_id = raw_sample_data['edge_id'][major_minor_start:major_minor_end] + majors = raw_sample_data["majors"][major_minor_start:major_minor_end] + minors = raw_sample_data["minors"][major_minor_start:major_minor_end] + edge_id = raw_sample_data["edge_id"][major_minor_start:major_minor_end] # don't retrieve edge type for a homogeneous graph - renumber_map_start = raw_sample_data['renumber_map_offsets'][index] - renumber_map_end = raw_sample_data['renumber_map_offsets'][index + 1] + renumber_map_start = raw_sample_data["renumber_map_offsets"][index] + renumber_map_end = raw_sample_data["renumber_map_offsets"][index + 1] - renumber_map = raw_sample_data['map'][renumber_map_start:renumber_map_end] + renumber_map = raw_sample_data["map"][renumber_map_start:renumber_map_end] - num_sampled_edges = raw_sample_data['label_hop_offsets'][index * fanout_length : (index + 1) * fanout_length + 1].diff().cpu() - - num_seeds = (majors[:num_sampled_edges[0]].max() + 1).reshape((1,)).cpu() - num_sampled_nodes_hops = torch.tensor([ - minors[:num_sampled_edges[:i].sum()].max() + 1 - for i in range(1, fanout_length + 1) - ], device='cpu') - - num_sampled_nodes = torch.concat([ - num_seeds, - num_sampled_nodes_hops.diff(prepend=num_seeds) - ]) + num_sampled_edges = ( + raw_sample_data["label_hop_offsets"][ + index * fanout_length : (index + 1) * fanout_length + 1 + ] + .diff() + .cpu() + ) + + num_seeds = (majors[: num_sampled_edges[0]].max() + 1).reshape((1,)).cpu() + num_sampled_nodes_hops = torch.tensor( + [ + minors[: num_sampled_edges[:i].sum()].max() + 1 + for i in range(1, fanout_length + 1) + ], + device="cpu", 
+ ) + + num_sampled_nodes = torch.concat( + [num_seeds, num_sampled_nodes_hops.diff(prepend=num_seeds)] + ) return torch_geometric.sampler.SamplerOutput( node=renumber_map.cpu(), @@ -218,35 +264,60 @@ def __decode_coo(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): num_sampled_edges=num_sampled_edges, ) - def _decode(self, raw_sample_data: Dict[str, 'torch.Tensor'], index: int): - if 'major_offsets' in raw_sample_data: + def _decode(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int): + if "major_offsets" in raw_sample_data: return self.__decode_csc(raw_sample_data, index) else: return self.__decode_coo(raw_sample_data, index) + class BaseSampler: - def __init__(self, sampler: DistSampler, data: Tuple['torch_geometric.data.FeatureStore', 'torch_geometric.data.GraphStore'], batch_size:int=16): + def __init__( + self, + sampler: DistSampler, + data: Tuple[ + "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore" + ], + batch_size: int = 16, + ): self.__sampler = sampler self.__feature_store, self.__graph_store = data self.__batch_size = batch_size - def sample_from_nodes(self, index: 'torch_geometric.sampler.NodeSamplerInput', **kwargs) -> Iterator[Union['torch_geometric.sampler.HeteroSamplerOutput', 'torch_geometric.sampler.SamplerOutput']]: + def sample_from_nodes( + self, index: "torch_geometric.sampler.NodeSamplerInput", **kwargs + ) -> Iterator[ + Union[ + "torch_geometric.sampler.HeteroSamplerOutput", + "torch_geometric.sampler.SamplerOutput", + ] + ]: self.__sampler.sample_from_nodes( - index.node, - batch_size=self.__batch_size, - **kwargs + index.node, batch_size=self.__batch_size, **kwargs ) edge_attrs = self.__graph_store.get_all_edge_attrs() - if len(edge_attrs) == 1 and edge_attrs[0].edge_type[0] == edge_attrs[0].edge_type[2]: - return HomogeneousSampleReader( - self.__sampler.get_reader() - ) + if ( + len(edge_attrs) == 1 + and edge_attrs[0].edge_type[0] == edge_attrs[0].edge_type[2] + ): + return HomogeneousSampleReader(self.__sampler.get_reader()) else: # TODO implement heterogeneous sampling raise NotImplementedError( - "Sampling heterogeneous graphs is currently unsupported in the non-dask API" + "Sampling heterogeneous graphs is currently" + " unsupported in the non-dask API" ) - def sample_from_edges(self, index: 'torch_geometric.sampler.EdgeSamplerInput', neg_sampling: Optional['torch_geometric.sampler.NegativeSampling'], **kwargs) -> Iterator[Union['torch_geometric.sampler.HeteroSamplerOutput', 'torch_geometric.sampler.SamplerOutput']]: - raise NotImplementedError("Edge sampling is currently unimplemented.") \ No newline at end of file + def sample_from_edges( + self, + index: "torch_geometric.sampler.EdgeSamplerInput", + neg_sampling: Optional["torch_geometric.sampler.NegativeSampling"], + **kwargs, + ) -> Iterator[ + Union[ + "torch_geometric.sampler.HeteroSamplerOutput", + "torch_geometric.sampler.SamplerOutput", + ] + ]: + raise NotImplementedError("Edge sampling is currently unimplemented.") diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py index 599dea262db..3cc16a1b938 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py @@ -401,4 +401,3 @@ def _sampler_output_from_sampling_results_heterogeneous( num_sampled_edges=num_edges_per_hop_dict, metadata=metadata, ) - diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml index 
dfa522e6047..44f9707cf82 100644 --- a/python/cugraph-pyg/pyproject.toml +++ b/python/cugraph-pyg/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "numba>=0.57", "numpy>=1.23,<2.0a0", "pylibcugraphops==24.6.*", + "tensordict", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py index 11f7940df54..20e0391fff1 100644 --- a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py @@ -36,37 +36,44 @@ class DistSampleReader: - def __init__(self, directory:str, *, format: str = "parquet", rank:Optional[int] = None, filelist=None): + def __init__( + self, + directory: str, + *, + format: str = "parquet", + rank: Optional[int] = None, + filelist=None, + ): self.__format = format self.__directory = directory if format != "parquet": raise ValueError("Invalid format (currently supported: 'parquet')") - + if filelist is None: files = os.listdir(directory) - ex = re.compile(r'batch\=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet') + ex = re.compile(r"batch\=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet") filematch = [ex.match(f) for f in files] filematch = [f for f in filematch if f] filematch = [f for f in filematch if int(f[1]) == rank] batch_count = sum([int(f[4]) - int(f[2]) + 1 for f in filematch]) filematch = sorted(filematch, key=lambda f: int(f[2]), reverse=True) - + self.__files = filematch else: self.__files = list(filelist) - + if rank is None: self.__batch_count = batch_count else: - batch_count = torch.tensor([batch_count], device='cuda') + batch_count = torch.tensor([batch_count], device="cuda") torch.distributed.all_reduce(batch_count, torch.distributed.ReduceOp.MIN) self.__batch_count = int(batch_count) - + def __iter__(self): return self - + def __next__(self): if len(self.__files) > 0: f = self.__files.pop() @@ -74,24 +81,25 @@ def __next__(self): start_inclusive = int(f[2]) end_inclusive = int(f[4]) - if(end_inclusive - start_inclusive + 1) > self.__batch_count: + if (end_inclusive - start_inclusive + 1) > self.__batch_count: end_inclusive = start_inclusive + self.__batch_count - 1 self.__batch_count = 0 else: - self.__batch_count -= (end_inclusive - start_inclusive + 1) + self.__batch_count -= end_inclusive - start_inclusive + 1 df = cudf.read_parquet(os.path.join(self.__directory, fname)) tensors = {} for col in list(df.columns): s = df[col].dropna() if len(s) > 0: - tensors[col] = torch.as_tensor(s, device='cuda') + tensors[col] = torch.as_tensor(s, device="cuda") df.drop(col, axis=1, inplace=True) - + return tensors, start_inclusive, end_inclusive raise StopIteration + class DistSampleWriter: def __init__( self, @@ -131,8 +139,10 @@ def _directory(self): @property def _batches_per_partition(self): return self.__batches_per_partition - - def get_reader(self, rank: int) -> Iterator[Tuple[Dict[str, 'torch.Tensor'], int, int]]: + + def get_reader( + self, rank: int + ) -> Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]: """ Returns an iterator over sampled data. 
""" @@ -250,7 +260,6 @@ def __write_minibatches_csr(self, minibatch_dict): fanout_length = (len(minibatch_dict["label_hop_offsets"]) - 1) // len( minibatch_dict["batch_id"] ) - rank_batch_offset = minibatch_dict["batch_id"][0] for p in range( 0, int(ceil(len(minibatch_dict["batch_id"]) / self.__batches_per_partition)) @@ -266,11 +275,18 @@ def __write_minibatches_csr(self, minibatch_dict): start_batch_id = batch_id_array_p[0] # major offsets and minors - major_offsets_start_incl, major_offsets_end_incl = label_hop_offsets_array_p[[0, -1]] + ( + major_offsets_start_incl, + major_offsets_end_incl, + ) = label_hop_offsets_array_p[[0, -1]] - start_ix,end_ix = minibatch_dict['major_offsets'][[major_offsets_start_incl, major_offsets_end_incl]] + start_ix, end_ix = minibatch_dict["major_offsets"][ + [major_offsets_start_incl, major_offsets_end_incl] + ] - major_offsets_array_p = minibatch_dict["major_offsets"][major_offsets_start_incl : major_offsets_end_incl + 1] + major_offsets_array_p = minibatch_dict["major_offsets"][ + major_offsets_start_incl : major_offsets_end_incl + 1 + ] minors_array_p = minibatch_dict["minors"][start_ix:end_ix] edge_id_array_p = ( @@ -350,7 +366,7 @@ def __init__( graph: Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph], writer: DistSampleWriter, local_seeds_per_call: int, - retain_original_seeds: bool = False, + retain_original_seeds: bool = False, ): """ Parameters @@ -382,8 +398,7 @@ def __init__( self.__handle = None self.__retain_original_seeds = retain_original_seeds - - def get_reader(self) -> Iterator[Tuple[Dict[str, 'torch.Tensor'], int, int]]: + def get_reader(self) -> Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]: """ Returns an iterator over sampled data. """ @@ -682,19 +697,20 @@ def __init__( retain_original_seeds=retain_original_seeds, ) - def __calc_local_seeds_per_call(self, local_seeds_per_call: Optional[int]=None): + def __calc_local_seeds_per_call(self, local_seeds_per_call: Optional[int] = None): if local_seeds_per_call is None: if len([x for x in self.__fanout if x <= 0]) > 0: return UniformNeighborSampler.UNKNOWN_VERTICES_DEFAULT - + total_memory = torch.cuda.get_device_properties(0).total_memory - fanout_prod = reduce(lambda x, y : x * y, self.__fanout) + fanout_prod = reduce(lambda x, y: x * y, self.__fanout) return int( - UniformNeighborSampler.BASE_VERTICES_PER_BYTE * total_memory / fanout_prod + UniformNeighborSampler.BASE_VERTICES_PER_BYTE + * total_memory + / fanout_prod ) - - return local_seeds_per_call + return local_seeds_per_call def sample_batches( self, @@ -714,10 +730,14 @@ def sample_batches( ) if self._retain_original_seeds: - label_offsets = torch.concat([ - torch.searchsorted(batch_ids, local_label_list), - torch.tensor([batch_ids.shape[0]], device='cuda', dtype=torch.int64) - ]) + label_offsets = torch.concat( + [ + torch.searchsorted(batch_ids, local_label_list), + torch.tensor( + [batch_ids.shape[0]], device="cuda", dtype=torch.int64 + ), + ] + ) else: label_offsets = None @@ -748,10 +768,14 @@ def sample_batches( if self._retain_original_seeds: batch_ids = batch_ids.to(device="cuda", dtype=torch.int32) local_label_list = torch.unique(batch_ids) - label_offsets = torch.concat([ - torch.searchsorted(batch_ids, local_label_list), - torch.tensor([batch_ids.shape[0]], device='cuda', dtype=torch.int64) - ]) + label_offsets = torch.concat( + [ + torch.searchsorted(batch_ids, local_label_list), + torch.tensor( + [batch_ids.shape[0]], device="cuda", dtype=torch.int64 + ), + ] + ) else: label_offsets = None @@ -771,8 +795,8 @@ 
def sample_batches( renumber=True, compression=self.__compression, compress_per_hop=self.__compress_per_hop, - retain_seeds = self._retain_original_seeds, - label_offsets = cupy.asarray(label_offsets), + retain_seeds=self._retain_original_seeds, + label_offsets=cupy.asarray(label_offsets), return_dict=True, ) diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index 75b5c1c5aa9..df80ae56a96 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -106,16 +106,16 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ generators └─ complete_bipartite_graph centrality - ├─ betweenness + ├─ betweenness │ ├─ betweenness_centrality │ └─ edge_betweenness_centrality - ├─ degree_alg + ├─ degree_alg │ ├─ degree_centrality │ ├─ in_degree_centrality │ └─ out_degree_centrality - ├─ eigenvector + ├─ eigenvector │ └─ eigenvector_centrality - └─ katz + └─ katz └─ katz_centrality cluster ├─ average_clustering @@ -126,12 +126,12 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ louvain └─ louvain_communities components - ├─ connected + ├─ connected │ ├─ connected_components │ ├─ is_connected │ ├─ node_connected_component │ └─ number_connected_components - └─ weakly_connected + └─ weakly_connected ├─ is_weakly_connected ├─ number_weakly_connected_components └─ weakly_connected_components From c09877a59582d6069d2f197e3afdb42bd1677231 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 8 May 2024 15:08:29 -0700 Subject: [PATCH 46/80] remove tensordict from run --- conda/environments/all_cuda-118_arch-x86_64.yaml | 1 - conda/environments/all_cuda-122_arch-x86_64.yaml | 1 - dependencies.yaml | 1 - python/cugraph-pyg/pyproject.toml | 1 - 4 files changed, 4 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 3679ff30ad5..659a2b911fb 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -69,7 +69,6 @@ dependencies: - sphinx-markdown-tables - sphinx<6 - sphinxcontrib-websupport -- tensordict - ucx-proc=*=gpu - ucx-py==0.38.* - wget diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 0fa09fd4742..377e4151f9b 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -74,7 +74,6 @@ dependencies: - sphinx-markdown-tables - sphinx<6 - sphinxcontrib-websupport -- tensordict - ucx-proc=*=gpu - ucx-py==0.38.* - wget diff --git a/dependencies.yaml b/dependencies.yaml index cd0e95d2a08..de6b6eb6c5c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -481,7 +481,6 @@ dependencies: packages: - *numba - *numpy - - tensordict - output_types: [pyproject] packages: - *cugraph diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml index 44f9707cf82..dfa522e6047 100644 --- a/python/cugraph-pyg/pyproject.toml +++ b/python/cugraph-pyg/pyproject.toml @@ -31,7 +31,6 @@ dependencies = [ "numba>=0.57", "numpy>=1.23,<2.0a0", "pylibcugraphops==24.6.*", - "tensordict", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.urls] From f0e3988e9378b0481efc6e45d0f9dfb0408cbddf Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Thu, 9 May 2024 00:29:14 -0400 Subject: [PATCH 47/80] dependencies.yaml --- dependencies.yaml | 2 +- python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 9dca069ea33..c0699fdb1c5 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -565,7 +565,7 @@ dependencies: - cugraph==24.6.* - pytorch>=2.0 - pytorch-cuda==11.8 - - pyg>=2.4.0 + - pyg>=2.5,<2.6 depends_on_rmm: common: diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml index 94e9f1decbd..ebef0094cfa 100644 --- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml +++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml @@ -12,7 +12,7 @@ dependencies: - cugraph==24.6.* - pandas - pre-commit -- pyg>=2.4.0 +- pyg>=2.5,<2.6 - pylibcugraphops==24.6.* - pytest - pytest-benchmark From ce0db3db0dc4a0fe34d53548fadcc20394495a80 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Mon, 13 May 2024 13:06:07 -0700 Subject: [PATCH 48/80] fixes --- .../cugraph_pyg/sampler/cugraph_sampler.py | 12 ++++++------ .../cugraph_pyg/tests/test_cugraph_loader.py | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py index ffab54efe08..d336cd51cb3 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py @@ -171,8 +171,8 @@ def _sampler_output_from_sampling_results_homogeneous_coo( row=row_dict, col=col_dict, edge=None, - num_sampled_nodes=num_nodes_per_hop_dict, - num_sampled_edges=num_edges_per_hop_dict, + num_sampled_nodes={k: t.tolist() for k, t in num_nodes_per_hop_dict.items()}, + num_sampled_edges={k: t.tolist() for k, t in num_edges_per_hop_dict.items()}, metadata=metadata, ) @@ -222,7 +222,7 @@ def _sampler_output_from_sampling_results_homogeneous_csr( major_offsets = major_offsets.clone() - major_offsets[0] label_hop_offsets = label_hop_offsets.clone() - label_hop_offsets[0] - num_edges_per_hop_dict = {edge_type: major_offsets[label_hop_offsets].diff().cpu()} + num_edges_per_hop_dict = {edge_type: major_offsets[label_hop_offsets].diff().tolist()} label_hop_offsets = label_hop_offsets.cpu() num_nodes_per_hop_dict = { @@ -231,7 +231,7 @@ def _sampler_output_from_sampling_results_homogeneous_csr( label_hop_offsets.diff(), (renumber_map.shape[0] - label_hop_offsets[-1]).reshape((1,)), ] - ).cpu() + ).tolist() } noi_index = {node_type: torch.as_tensor(renumber_map, device="cuda")} @@ -397,8 +397,8 @@ def _sampler_output_from_sampling_results_heterogeneous( row=row_dict, col=col_dict, edge=None, - num_sampled_nodes=num_nodes_per_hop_dict, - num_sampled_edges=num_edges_per_hop_dict, + num_sampled_nodes={k: t.tolist() for k, t in num_nodes_per_hop_dict.items()}, + num_sampled_edges={k: t.tolist() for k, t in num_edges_per_hop_dict.items()}, metadata=metadata, ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py index 75549e9d313..3a8d221a0ac 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py @@ -282,8 +282,8 @@ def test_cugraph_loader_from_disk_subset_csr(): ) assert 
row.tolist() == bogus_samples.minors.dropna().values_host.tolist() - assert sample["t0"]["num_sampled_nodes"].tolist() == [1, 3, 2] - assert sample["t0", "knows", "t0"]["num_sampled_edges"].tolist() == [3, 5] + assert sample["t0"]["num_sampled_nodes"] == [1, 3, 2] + assert sample["t0", "knows", "t0"]["num_sampled_edges"] == [3, 5] assert num_samples == 100 @@ -337,10 +337,10 @@ def test_cugraph_loader_e2e_coo(): for hetero_data in loader: ei = hetero_data["t0", "knows", "t0"]["edge_index"] x = hetero_data["t0"]["x"].cuda() - num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"].tolist() + num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"] num_sampled_edges = hetero_data["t0", "knows", "t0"][ "num_sampled_edges" - ].tolist() + ] for i in range(len(convs)): x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None) From b1943a7478856ad872b755928b4bac67ed1caf89 Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Mon, 13 May 2024 23:39:11 -0400 Subject: [PATCH 49/80] format --- python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py | 4 +++- python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py index d336cd51cb3..8bcfb783ae1 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py @@ -222,7 +222,9 @@ def _sampler_output_from_sampling_results_homogeneous_csr( major_offsets = major_offsets.clone() - major_offsets[0] label_hop_offsets = label_hop_offsets.clone() - label_hop_offsets[0] - num_edges_per_hop_dict = {edge_type: major_offsets[label_hop_offsets].diff().tolist()} + num_edges_per_hop_dict = { + edge_type: major_offsets[label_hop_offsets].diff().tolist() + } label_hop_offsets = label_hop_offsets.cpu() num_nodes_per_hop_dict = { diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py index 3a8d221a0ac..ab20ef01fd3 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py @@ -338,9 +338,7 @@ def test_cugraph_loader_e2e_coo(): ei = hetero_data["t0", "knows", "t0"]["edge_index"] x = hetero_data["t0"]["x"].cuda() num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"] - num_sampled_edges = hetero_data["t0", "knows", "t0"][ - "num_sampled_edges" - ] + num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"] for i in range(len(convs)): x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None) From 3d19449cd7a1cc0477a5ef5cd81cf74970517c30 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 11:07:29 -0700 Subject: [PATCH 50/80] fix method name --- python/cugraph-pyg/cugraph_pyg/data/graph_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py index 1ba712a4dbd..9bac04fc3b0 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py @@ -160,7 +160,7 @@ def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: cupy.asarray(edgelist_dict["src"]), cupy.asarray(edgelist_dict["dst"]), vertices_array=cupy.arange( - sum(self._num_vertices.values()), dtype="int64" + sum(self._num_vertices().values()), dtype="int64" 
), edge_id_array=cupy.asarray(edgelist_dict["eid"]), edge_type_array=cupy.asarray(edgelist_dict["etp"]), From 28efdb4403d2463412068d126b87d126d65e65f9 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 11:18:24 -0700 Subject: [PATCH 51/80] don't do expensive check --- python/cugraph/cugraph/sampling/uniform_neighbor_sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index eafadfa4ff0..2e3d032a20f 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -353,7 +353,7 @@ def uniform_neighbor_sample( else None, h_fan_out=fanout_vals, with_replacement=with_replacement, - do_expensive_check=True, + do_expensive_check=False, with_edge_properties=with_edge_properties, random_state=random_state, prior_sources_behavior=prior_sources_behavior, From aed0555c43b5f5ff415a87c16dd5d7d50d025149 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 11:21:55 -0700 Subject: [PATCH 52/80] revert readme --- python/nx-cugraph/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index df80ae56a96..75b5c1c5aa9 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -106,16 +106,16 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ generators └─ complete_bipartite_graph centrality - ├─ betweenness + ├─ betweenness │ ├─ betweenness_centrality │ └─ edge_betweenness_centrality - ├─ degree_alg + ├─ degree_alg │ ├─ degree_centrality │ ├─ in_degree_centrality │ └─ out_degree_centrality - ├─ eigenvector + ├─ eigenvector │ └─ eigenvector_centrality - └─ katz + └─ katz └─ katz_centrality cluster ├─ average_clustering @@ -126,12 +126,12 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ louvain └─ louvain_communities components - ├─ connected + ├─ connected │ ├─ connected_components │ ├─ is_connected │ ├─ node_connected_component │ └─ number_connected_components - └─ weakly_connected + └─ weakly_connected ├─ is_weakly_connected ├─ number_weakly_connected_components └─ weakly_connected_components From 6dc5e095fae0893679ae7dfa2a8508f118ff46f2 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 11:30:32 -0700 Subject: [PATCH 53/80] update rst --- .../source/api_docs/cugraph-pyg/cugraph_pyg.rst | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst b/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst index a150d4db9fe..1bbbefe7e5b 100644 --- a/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst +++ b/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst @@ -9,5 +9,15 @@ cugraph-pyg .. autosummary:: :toctree: ../api/cugraph-pyg/ -.. cugraph_pyg.data.cugraph_store.EXPERIMENTAL__CuGraphStore -.. cugraph_pyg.sampler.cugraph_sampler.EXPERIMENTAL__CuGraphSampler +.. cugraph_pyg.data.dask_graph_store.DaskGraphStore +.. cugraph_pyg.data.graph_store.GraphStore +.. cugraph_pyg.data.feature_store.TensorDictFeatureStore +.. cugraph_pyg.loader.dask_node_loader.DaskNeighborLoader +.. cugraph_pyg.loader.dask_node_loader.BulkSampleLoader +.. cugraph_pyg.loader.node_loader.NodeLoader +.. cugraph_pyg.loader.neighbor_loader.NeighborLoader +.. cugraph_pyg.sampler.sampler.BaseSampler +.. 
cugraph_pyg.sampler.sampler.SampleReader +.. cugraph_pyg.sampler.sampler.HomogeneousSampleReader +.. cugraph_pyg.sampler.sampler.SampleIterator + From d3f04ebe2765b6f240591ac476254ebe74f50a1e Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 11:34:37 -0700 Subject: [PATCH 54/80] update rst --- .../api_docs/cugraph-pyg/cugraph_pyg.rst | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst b/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst index 1bbbefe7e5b..38f295fc5ea 100644 --- a/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst +++ b/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst @@ -6,18 +6,37 @@ cugraph-pyg .. currentmodule:: cugraph_pyg +Graph Storage +------------- .. autosummary:: :toctree: ../api/cugraph-pyg/ .. cugraph_pyg.data.dask_graph_store.DaskGraphStore .. cugraph_pyg.data.graph_store.GraphStore + +Feature Storage +--------------- +.. autosummary:: + :toctree: ../api/cugraph-pyg/ + .. cugraph_pyg.data.feature_store.TensorDictFeatureStore + +Data Loaders +------------ +.. autosummary:: + :toctree: ../api/cugraph-pyg/ + .. cugraph_pyg.loader.dask_node_loader.DaskNeighborLoader .. cugraph_pyg.loader.dask_node_loader.BulkSampleLoader .. cugraph_pyg.loader.node_loader.NodeLoader .. cugraph_pyg.loader.neighbor_loader.NeighborLoader + +Samplers +-------- +.. autosummary:: + :toctree: ../api/cugraph-pyg/ + .. cugraph_pyg.sampler.sampler.BaseSampler .. cugraph_pyg.sampler.sampler.SampleReader .. cugraph_pyg.sampler.sampler.HomogeneousSampleReader .. cugraph_pyg.sampler.sampler.SampleIterator - From ed28ae6c70d14b87d9270f68961f4c8a97175925 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 11:37:21 -0700 Subject: [PATCH 55/80] change rst format --- .../api_docs/cugraph-pyg/cugraph_pyg.rst | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst b/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst index 38f295fc5ea..5475fd6c581 100644 --- a/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst +++ b/docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst @@ -11,32 +11,32 @@ Graph Storage .. autosummary:: :toctree: ../api/cugraph-pyg/ -.. cugraph_pyg.data.dask_graph_store.DaskGraphStore -.. cugraph_pyg.data.graph_store.GraphStore + cugraph_pyg.data.dask_graph_store.DaskGraphStore + cugraph_pyg.data.graph_store.GraphStore Feature Storage --------------- .. autosummary:: :toctree: ../api/cugraph-pyg/ -.. cugraph_pyg.data.feature_store.TensorDictFeatureStore + cugraph_pyg.data.feature_store.TensorDictFeatureStore Data Loaders ------------ .. autosummary:: :toctree: ../api/cugraph-pyg/ -.. cugraph_pyg.loader.dask_node_loader.DaskNeighborLoader -.. cugraph_pyg.loader.dask_node_loader.BulkSampleLoader -.. cugraph_pyg.loader.node_loader.NodeLoader -.. cugraph_pyg.loader.neighbor_loader.NeighborLoader + cugraph_pyg.loader.dask_node_loader.DaskNeighborLoader + cugraph_pyg.loader.dask_node_loader.BulkSampleLoader + cugraph_pyg.loader.node_loader.NodeLoader + cugraph_pyg.loader.neighbor_loader.NeighborLoader Samplers -------- .. autosummary:: :toctree: ../api/cugraph-pyg/ -.. cugraph_pyg.sampler.sampler.BaseSampler -.. cugraph_pyg.sampler.sampler.SampleReader -.. cugraph_pyg.sampler.sampler.HomogeneousSampleReader -.. 
cugraph_pyg.sampler.sampler.SampleIterator + cugraph_pyg.sampler.sampler.BaseSampler + cugraph_pyg.sampler.sampler.SampleReader + cugraph_pyg.sampler.sampler.HomogeneousSampleReader + cugraph_pyg.sampler.sampler.SampleIterator From 8d578040c1f373101136477bdfc7b5916fd6da10 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 12:01:06 -0700 Subject: [PATCH 56/80] propagate rename --- .../cugraph-pyg/cugraph_pyg/data/__init__.py | 2 +- .../cugraph_pyg/data/dask_graph_store.py | 2 +- .../cugraph_pyg/examples/graph_sage_mg.py | 10 +++--- .../cugraph_pyg/examples/graph_sage_sg.py | 8 ++--- .../cugraph_pyg/loader/__init__.py | 2 +- .../cugraph_pyg/sampler/sampler_utils.py | 18 +++++----- .../tests/mg/test_mg_cugraph_loader.py | 12 +++---- .../tests/mg/test_mg_cugraph_sampler.py | 8 ++--- .../tests/mg/test_mg_cugraph_store.py | 34 +++++++++---------- .../cugraph_pyg/tests/test_cugraph_loader.py | 30 ++++++++-------- .../cugraph_pyg/tests/test_cugraph_sampler.py | 8 ++--- .../cugraph_pyg/tests/test_cugraph_store.py | 30 ++++++++-------- 12 files changed, 82 insertions(+), 82 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/data/__init__.py b/python/cugraph-pyg/cugraph_pyg/data/__init__.py index c2ff510821c..4c6f267410d 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/data/__init__.py @@ -19,5 +19,5 @@ def CuGraphStore(*args, **kwargs): - warnings.warn("CuGraphStore has been renamed to DaskGraphStore") + warnings.warn("CuGraphStore has been renamed to DaskGraphStore", FutureWarning) return DaskGraphStore(*args, **kwargs) diff --git a/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py index 40c979d5b0b..ef22982c4da 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py @@ -221,7 +221,7 @@ def __init__( order: str = "CSR", ): """ - Constructs a new CuGraphStore from the provided + Constructs a new DaskGraphStore from the provided arguments. Parameters diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py index 80d683e6c79..7fb0bf69aee 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py @@ -159,8 +159,8 @@ def train( td.barrier() import cugraph - from cugraph_pyg.data import CuGraphStore - from cugraph_pyg.loader import CuGraphNeighborLoader + from cugraph_pyg.data import DaskGraphStore + from cugraph_pyg.loader import DaskNeighborLoader if rank == 0: print("Rank 0 downloading dataset") @@ -212,7 +212,7 @@ def train( # Rank 0 will initialize the distributed cugraph graph. cugraph_store_create_start = time.perf_counter_ns() print("G:", G[("paper", "cites", "paper")].shape) - cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(fs, G, N, multi_gpu=True) cugraph_store_create_end = time.perf_counter_ns() print( "cuGraph Store created on rank 0 in " @@ -237,7 +237,7 @@ def train( # Will automatically use the stored distributed cugraph graph on rank 0. 
cugraph_store_create_start = time.perf_counter_ns() - cugraph_store = CuGraphStore(fs, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(fs, G, N, multi_gpu=True) cugraph_store_create_end = time.perf_counter_ns() print( f"Rank {rank} created cugraph store in " @@ -269,7 +269,7 @@ def train( model.train() start_time_loader = time.perf_counter_ns() - cugraph_bulk_loader = CuGraphNeighborLoader( + cugraph_bulk_loader = DaskNeighborLoader( cugraph_store, train_nodes, batch_size=250, diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py index 58a403084df..e0169ee2c25 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py @@ -74,8 +74,8 @@ def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) - init_pytorch_worker(device) import cugraph - from cugraph_pyg.data import CuGraphStore - from cugraph_pyg.loader import CuGraphNeighborLoader + from cugraph_pyg.data import DaskGraphStore + from cugraph_pyg.loader import DaskNeighborLoader from ogb.nodeproppred import NodePropPredDataset @@ -106,7 +106,7 @@ def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) - fs.add_data(train_mask, "paper", "train") - cugraph_store = CuGraphStore(fs, G, N) + cugraph_store = DaskGraphStore(fs, G, N) model = ( CuGraphSAGE(in_channels=128, hidden_channels=64, out_channels=349, num_layers=3) @@ -120,7 +120,7 @@ def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) - start_time_train = time.perf_counter_ns() model.train() - cugraph_bulk_loader = CuGraphNeighborLoader( + cugraph_bulk_loader = DaskNeighborLoader( cugraph_store, train_nodes, batch_size=500, num_neighbors=[10, 25] ) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py index c8ab46f4205..384329c11c6 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py @@ -22,5 +22,5 @@ def CuGraphNeighborLoader(*args, **kwargs): - warnings.warn("CuGraphNeighborLoader has been renamed to DaskNeighborLoader") + warnings.warn("CuGraphNeighborLoader has been renamed to DaskNeighborLoader", FutureWarning) return DaskNeighborLoader(*args, **kwargs) diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py index 3cc16a1b938..ea409a1b51f 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py @@ -14,7 +14,7 @@ from typing import Sequence, Dict, Tuple -from cugraph_pyg.data import CuGraphStore +from cugraph_pyg.data import DaskGraphStore from cugraph.utilities.utils import import_optional import cudf @@ -28,7 +28,7 @@ def _get_unique_nodes( sampling_results: cudf.DataFrame, - graph_store: CuGraphStore, + graph_store: DaskGraphStore, node_type: str, node_position: str, ) -> int: @@ -40,7 +40,7 @@ def _get_unique_nodes( sampling_results: cudf.DataFrame The dataframe containing sampling results or filtered sampling results (i.e. sampling results for hop 2) - graph_store: CuGraphStore + graph_store: DaskGraphStore The graph store containing the structure of the sampled graph. node_type: str The node type to count the number of unique nodes of. 
@@ -81,7 +81,7 @@ def _get_unique_nodes( def _sampler_output_from_sampling_results_homogeneous_coo( sampling_results: cudf.DataFrame, renumber_map: torch.Tensor, - graph_store: CuGraphStore, + graph_store: DaskGraphStore, data_index: Dict[Tuple[int, int], Dict[str, int]], batch_id: int, metadata: Sequence = None, @@ -94,7 +94,7 @@ def _sampler_output_from_sampling_results_homogeneous_coo( renumber_map: torch.Tensor The tensor containing the renumber map, or None if there is no renumber map. - graph_store: CuGraphStore + graph_store: DaskGraphStore The graph store containing the structure of the sampled graph. data_index: Dict[Tuple[int, int], Dict[str, int]] Dictionary where keys are the batch id and hop id, @@ -181,7 +181,7 @@ def _sampler_output_from_sampling_results_homogeneous_csr( major_offsets: torch.Tensor, minors: torch.Tensor, renumber_map: torch.Tensor, - graph_store: CuGraphStore, + graph_store: DaskGraphStore, label_hop_offsets: torch.Tensor, batch_id: int, metadata: Sequence = None, @@ -196,7 +196,7 @@ def _sampler_output_from_sampling_results_homogeneous_csr( renumber_map: torch.Tensor The tensor containing the renumber map. Required. - graph_store: CuGraphStore + graph_store: DaskGraphStore The graph store containing the structure of the sampled graph. label_hop_offsets: torch.Tensor The tensor containing the label-hop offsets. @@ -261,7 +261,7 @@ def _sampler_output_from_sampling_results_homogeneous_csr( def _sampler_output_from_sampling_results_heterogeneous( sampling_results: cudf.DataFrame, renumber_map: cudf.Series, - graph_store: CuGraphStore, + graph_store: DaskGraphStore, metadata: Sequence = None, ) -> HeteroSamplerOutput: """ @@ -272,7 +272,7 @@ def _sampler_output_from_sampling_results_heterogeneous( renumber_map: cudf.Series The series containing the renumber map, or None if there is no renumber map. - graph_store: CuGraphStore + graph_store: DaskGraphStore The graph store containing the structure of the sampled graph. metadata: Tensor The metadata for the sampled batch. 
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py index f5035a38621..7be67b3b1aa 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py @@ -13,8 +13,8 @@ import pytest -from cugraph_pyg.loader import CuGraphNeighborLoader -from cugraph_pyg.data import CuGraphStore +from cugraph_pyg.loader import DaskNeighborLoader +from cugraph_pyg.data import DaskGraphStore from cugraph.utilities.utils import import_optional, MissingModule torch = import_optional("torch") @@ -23,8 +23,8 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_basic(dask_client, karate_gnn): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") - loader = CuGraphNeighborLoader( + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") + loader = DaskNeighborLoader( (cugraph_store, cugraph_store), torch.arange(N["type0"] + N["type1"], dtype=torch.int64), 10, @@ -51,8 +51,8 @@ def test_cugraph_loader_basic(dask_client, karate_gnn): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_hetero(dask_client, karate_gnn): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") - loader = CuGraphNeighborLoader( + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") + loader = DaskNeighborLoader( (cugraph_store, cugraph_store), input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")), batch_size=2, diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py index 80a2d0a6c79..eb421fd9180 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py @@ -16,7 +16,7 @@ import pytest -from cugraph_pyg.data import CuGraphStore +from cugraph_pyg.data import DaskGraphStore from cugraph_pyg.sampler.cugraph_sampler import ( _sampler_output_from_sampling_results_heterogeneous, ) @@ -33,7 +33,7 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_neighbor_sample(dask_client, basic_graph_1): F, G, N = basic_graph_1 - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") batches = cudf.DataFrame( { @@ -98,7 +98,7 @@ def test_neighbor_sample(dask_client, basic_graph_1): @pytest.mark.skip(reason="broken") def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph_1): F, G, N = multi_edge_multi_vertex_graph_1 - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") batches = cudf.DataFrame( { @@ -190,7 +190,7 @@ def test_neighbor_sample_mock_sampling_results(dask_client): torch.tensor([3.2, 2.1], dtype=torch.float32), type_name="A", feat_name="prop1" ) - graph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") + graph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py index 
7047c62250b..edbd0ec8cc2 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py @@ -17,7 +17,7 @@ CuGraphEdgeAttr, EdgeLayout, ) -from cugraph_pyg.data import CuGraphStore +from cugraph_pyg.data import DaskGraphStore import cudf import dask_cudf @@ -120,7 +120,7 @@ def test_get_edge_index(graph, edge_index_type, dask_client): G[et][0] = dask_cudf.from_cudf(cudf.Series(G[et][0]), npartitions=1) G[et][1] = dask_cudf.from_cudf(cudf.Series(G[et][1]), npartitions=1) - cugraph_store = CuGraphStore(F, G, N, order="CSC", multi_gpu=True) + cugraph_store = DaskGraphStore(F, G, N, order="CSC", multi_gpu=True) for pyg_can_edge_type in G: src, dst = cugraph_store.get_edge_index( @@ -145,7 +145,7 @@ def test_get_edge_index(graph, edge_index_type, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_edge_types(graph, dask_client): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) eta = cugraph_store._edge_types_to_attrs assert eta.keys() == G.keys() @@ -161,7 +161,7 @@ def test_edge_types(graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_get_subgraph(graph, dask_client): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) if len(G.keys()) > 1: for edge_type in G.keys(): @@ -179,7 +179,7 @@ def test_get_subgraph(graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_renumber_vertices_basic(single_vertex_graph, dask_client): F, G, N = single_vertex_graph - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) nodes_of_interest = torch.as_tensor( cupy.random.randint(0, sum(N.values()), 3), device="cuda" @@ -194,7 +194,7 @@ def test_renumber_vertices_multi_edge_multi_vertex( multi_edge_multi_vertex_graph_1, dask_client ): F, G, N = multi_edge_multi_vertex_graph_1 - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) nodes_of_interest = torch.as_tensor( cupy.random.randint(0, sum(N.values()), 3), device="cuda" @@ -215,7 +215,7 @@ def test_renumber_vertices_multi_edge_multi_vertex( def test_renumber_edges(abc_graph, dask_client): F, G, N = abc_graph - graph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") + graph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( @@ -250,7 +250,7 @@ def test_renumber_edges(abc_graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_get_tensor(graph, dask_client): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) for feature_name, feature_on_types in F.get_feature_list().items(): for type_name in feature_on_types: @@ -271,7 +271,7 @@ def test_get_tensor(graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_get_tensor_empty_idx(karate_gnn, dask_client): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) t = cugraph_store.get_tensor( 
CuGraphTensorAttr(group_name="type0", attr_name="prop0", index=None) @@ -282,7 +282,7 @@ def test_get_tensor_empty_idx(karate_gnn, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_multi_get_tensor(graph, dask_client): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) for vertex_type in sorted(N.keys()): v_ids = np.arange(N[vertex_type]) @@ -309,7 +309,7 @@ def test_multi_get_tensor(graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_get_all_tensor_attrs(graph, dask_client): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) tensor_attrs = [] for vertex_type in sorted(N.keys()): @@ -341,7 +341,7 @@ def test_multi_get_tensor_spec_props(multi_edge_multi_vertex_graph_1, dask_clien @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_get_tensor_from_tensor_attrs(graph, dask_client): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) tensor_attrs = cugraph_store.get_all_tensor_attrs() for tensor_attr in tensor_attrs: @@ -355,7 +355,7 @@ def test_get_tensor_from_tensor_attrs(graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_get_tensor_size(graph, dask_client): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) tensor_attrs = cugraph_store.get_all_tensor_attrs() for tensor_attr in tensor_attrs: @@ -371,7 +371,7 @@ def test_get_tensor_size(graph, dask_client): ) def test_get_input_nodes(karate_gnn, dask_client): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) node_type, input_nodes = torch_geometric.loader.utils.get_input_nodes( (cugraph_store, cugraph_store), "type0" @@ -384,8 +384,8 @@ def test_get_input_nodes(karate_gnn, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_mg_frame_handle(graph, dask_client): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) - assert isinstance(cugraph_store._CuGraphStore__graph._plc_graph, dict) + cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) + assert isinstance(cugraph_store._DaskGraphStore__graph._plc_graph, dict) @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") @@ -399,7 +399,7 @@ def test_cugraph_loader_large_index(dask_client): F = cugraph.gnn.FeatureStore(backend="torch") F.add_data(large_features, "N", "f") - store = CuGraphStore( + store = DaskGraphStore( F, {("N", "e", "N"): large_index}, {"N": 1_000_000}, diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py index 9813fa933ee..a67c69b9ebc 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py @@ -20,9 +20,9 @@ import cupy import numpy as np -from cugraph_pyg.loader import CuGraphNeighborLoader +from cugraph_pyg.loader import DaskNeighborLoader from cugraph_pyg.loader import BulkSampleLoader -from cugraph_pyg.data import CuGraphStore +from cugraph_pyg.data import DaskGraphStore from cugraph_pyg.nn import 
SAGEConv as CuGraphSAGEConv from cugraph.gnn import FeatureStore @@ -49,8 +49,8 @@ def test_cugraph_loader_basic( ] ): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N, order="CSR") - loader = CuGraphNeighborLoader( + cugraph_store = DaskGraphStore(F, G, N, order="CSR") + loader = DaskNeighborLoader( (cugraph_store, cugraph_store), torch.arange(N["type0"] + N["type1"], dtype=torch.int64), 10, @@ -79,8 +79,8 @@ def test_cugraph_loader_hetero( ] ): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N, order="CSR") - loader = CuGraphNeighborLoader( + cugraph_store = DaskGraphStore(F, G, N, order="CSR") + loader = DaskNeighborLoader( (cugraph_store, cugraph_store), input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")), batch_size=2, @@ -114,7 +114,7 @@ def test_cugraph_loader_from_disk(): G = {("t0", "knows", "t0"): 9080} N = {"t0": 256} - cugraph_store = CuGraphStore(F, G, N, order="CSR") + cugraph_store = DaskGraphStore(F, G, N, order="CSR") bogus_samples = cudf.DataFrame( { @@ -171,7 +171,7 @@ def test_cugraph_loader_from_disk_subset(): G = {("t0", "knows", "t0"): 9080} N = {"t0": 256} - cugraph_store = CuGraphStore(F, G, N, order="CSR") + cugraph_store = DaskGraphStore(F, G, N, order="CSR") bogus_samples = cudf.DataFrame( { @@ -230,7 +230,7 @@ def test_cugraph_loader_from_disk_subset_csr(): G = {("t0", "knows", "t0"): 9080} N = {"t0": 256} - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) bogus_samples = cudf.DataFrame( { @@ -294,7 +294,7 @@ def test_cugraph_loader_e2e_coo(): G = {("t0", "knows", "t0"): 9999} N = {"t0": 256} - cugraph_store = CuGraphStore(F, G, N, order="CSR") + cugraph_store = DaskGraphStore(F, G, N, order="CSR") bogus_samples = cudf.DataFrame( { @@ -362,7 +362,7 @@ def test_cugraph_loader_e2e_csc(framework: str): G = {("t0", "knows", "t0"): 9999} N = {"t0": 256} - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) bogus_samples = cudf.DataFrame( { @@ -467,9 +467,9 @@ def test_drop_last(drop_last): F = FeatureStore(backend="torch") F.add_data(torch.arange(10), "N", "z") - store = CuGraphStore(F, G, N) + store = DaskGraphStore(F, G, N) with tempfile.TemporaryDirectory() as dir: - loader = CuGraphNeighborLoader( + loader = DaskNeighborLoader( (store, store), input_nodes=torch.tensor([0, 1, 2, 3, 4]), num_neighbors=[1], @@ -504,8 +504,8 @@ def test_load_directory( if directory == "local": local_dir = tempfile.TemporaryDirectory(dir=".") - cugraph_store = CuGraphStore(*karate_gnn) - cugraph_loader = CuGraphNeighborLoader( + cugraph_store = DaskGraphStore(*karate_gnn) + cugraph_loader = DaskNeighborLoader( (cugraph_store, cugraph_store), torch.arange(8, dtype=torch.int64), 2, diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py index e703d477b70..47e114b7618 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py @@ -16,7 +16,7 @@ import pytest -from cugraph_pyg.data import CuGraphStore +from cugraph_pyg.data import DaskGraphStore from cugraph_pyg.sampler.cugraph_sampler import ( _sampler_output_from_sampling_results_heterogeneous, ) @@ -31,7 +31,7 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_neighbor_sample(basic_graph_1): F, G, N = basic_graph_1 - cugraph_store = CuGraphStore(F, G, N, order="CSR") + cugraph_store = DaskGraphStore(F, G, N, order="CSR") batches = 
cudf.DataFrame( { @@ -90,7 +90,7 @@ def test_neighbor_sample(basic_graph_1): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): F, G, N = multi_edge_multi_vertex_graph_1 - cugraph_store = CuGraphStore(F, G, N, order="CSR") + cugraph_store = DaskGraphStore(F, G, N, order="CSR") batches = cudf.DataFrame( { @@ -151,7 +151,7 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): def test_neighbor_sample_mock_sampling_results(abc_graph): F, G, N = abc_graph - graph_store = CuGraphStore(F, G, N, order="CSR") + graph_store = DaskGraphStore(F, G, N, order="CSR") # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py index c99fd447aa0..837743f2aa9 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py @@ -17,7 +17,7 @@ CuGraphEdgeAttr, EdgeLayout, ) -from cugraph_pyg.data import CuGraphStore +from cugraph_pyg.data import DaskGraphStore import cudf import cupy @@ -113,7 +113,7 @@ def test_get_edge_index(graph, edge_index_type): G[et][0] = cudf.Series(G[et][0]) G[et][1] = cudf.Series(G[et][1]) - cugraph_store = CuGraphStore(F, G, N, order="CSC") + cugraph_store = DaskGraphStore(F, G, N, order="CSC") for pyg_can_edge_type in G: src, dst = cugraph_store.get_edge_index( @@ -131,7 +131,7 @@ def test_get_edge_index(graph, edge_index_type): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_edge_types(graph): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) eta = cugraph_store._edge_types_to_attrs assert eta.keys() == G.keys() @@ -147,7 +147,7 @@ def test_edge_types(graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_get_subgraph(graph): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) if len(G.keys()) > 1: for edge_type in G.keys(): @@ -165,7 +165,7 @@ def test_get_subgraph(graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_renumber_vertices_basic(single_vertex_graph): F, G, N = single_vertex_graph - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) nodes_of_interest = torch.as_tensor( cupy.random.randint(0, sum(N.values()), 3), device="cuda" @@ -178,7 +178,7 @@ def test_renumber_vertices_basic(single_vertex_graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_renumber_vertices_multi_edge_multi_vertex(multi_edge_multi_vertex_graph_1): F, G, N = multi_edge_multi_vertex_graph_1 - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) nodes_of_interest = torch.as_tensor( cupy.random.randint(0, sum(N.values()), 3), device="cuda" @@ -199,7 +199,7 @@ def test_renumber_vertices_multi_edge_multi_vertex(multi_edge_multi_vertex_graph def test_renumber_edges(abc_graph): F, G, N = abc_graph - graph_store = CuGraphStore(F, G, N, order="CSR") + graph_store = DaskGraphStore(F, G, N, order="CSR") # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( @@ -234,7 +234,7 @@ def test_renumber_edges(abc_graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch 
not available") def test_get_tensor(graph): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) for feature_name, feature_on_types in F.get_feature_list().items(): for type_name in feature_on_types: @@ -255,7 +255,7 @@ def test_get_tensor(graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_get_tensor_empty_idx(karate_gnn): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) t = cugraph_store.get_tensor( CuGraphTensorAttr(group_name="type0", attr_name="prop0", index=None) @@ -266,7 +266,7 @@ def test_get_tensor_empty_idx(karate_gnn): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_multi_get_tensor(graph): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) for vertex_type in sorted(N.keys()): v_ids = np.arange(N[vertex_type]) @@ -293,7 +293,7 @@ def test_multi_get_tensor(graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_get_all_tensor_attrs(graph): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) tensor_attrs = [] for vertex_type in sorted(N.keys()): @@ -333,7 +333,7 @@ def test_multi_get_tensor_spec_props(multi_edge_multi_vertex_graph_1): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_get_tensor_from_tensor_attrs(graph): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) tensor_attrs = cugraph_store.get_all_tensor_attrs() for tensor_attr in tensor_attrs: @@ -347,7 +347,7 @@ def test_get_tensor_from_tensor_attrs(graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_get_tensor_size(graph): F, G, N = graph - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) tensor_attrs = cugraph_store.get_all_tensor_attrs() for tensor_attr in tensor_attrs: @@ -363,7 +363,7 @@ def test_get_tensor_size(graph): ) def test_get_input_nodes(karate_gnn): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) input_node_info = torch_geometric.loader.utils.get_input_nodes( (cugraph_store, cugraph_store), "type0" @@ -387,7 +387,7 @@ def test_serialize(multi_edge_multi_vertex_no_graph_1): import pickle F, G, N = multi_edge_multi_vertex_no_graph_1 - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = DaskGraphStore(F, G, N) cugraph_store_copy = pickle.loads(pickle.dumps(cugraph_store)) From a5dd5798c3baada35dea0cb11e78cdd588d04f55 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 12:03:05 -0700 Subject: [PATCH 57/80] fix sg exapmle --- .../cugraph_pyg/examples/cugraph_dist_sampling_sg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py index e4e074ddc77..de45acc7456 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py @@ -67,7 +67,7 @@ def main(): el = dataset[0][0]["edge_index"].astype("int64") with tempfile.TemporaryDirectory() as directory: - sample(el, "/home/nfs/abarghi/deleteme/") + sample(el, directory) print("Printing samples...") for file in os.listdir(directory): From 
2a672e665f769c5cebedb7ca2d356ac59ed8c512 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 12:05:09 -0700 Subject: [PATCH 58/80] move instructions to python file --- python/cugraph-pyg/cugraph_pyg/examples/README.md | 11 ----------- .../cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py | 9 +++++++++ 2 files changed, 9 insertions(+), 11 deletions(-) delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/README.md diff --git a/python/cugraph-pyg/cugraph_pyg/examples/README.md b/python/cugraph-pyg/cugraph_pyg/examples/README.md deleted file mode 100644 index 572111ac26a..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/README.md +++ /dev/null @@ -1,11 +0,0 @@ -This directory contains examples for running cugraph-pyg training. - -For single-GPU (SG) scripts, no special configuration is required. - -For multi-GPU (MG) scripts, dask must be started first in a separate process. -To do this, the `start_dask.sh` script has been provided. This scripts starts -a dask scheduler and dask workers. To select the GPUs and amount of memory -allocated to dask per GPU, the `CUDA_VISIBLE_DEVICES` and `WORKER_RMM_POOL_SIZE` -arguments in that script can be modified. -To connect to dask, the scheduler JSON file must be provided. This can be done -using the `--dask_scheduler_file` argument in the mg python script being run. diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py index 7fb0bf69aee..1f801a00c1d 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py @@ -11,6 +11,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +# For this script, dask must be started first in a separate process. +# To do this, the `start_dask.sh` script has been provided. This scripts starts +# a dask scheduler and dask workers. To select the GPUs and amount of memory +# allocated to dask per GPU, the `CUDA_VISIBLE_DEVICES` and `WORKER_RMM_POOL_SIZE` +# arguments in that script can be modified. +# To connect to dask, the scheduler JSON file must be provided. This can be done +# using the `--dask_scheduler_file` argument in the mg python script being run. + + from ogb.nodeproppred import NodePropPredDataset From 908e24621ac33ac30ca3f98e3b9e4b9dadcd1131 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 12:06:06 -0700 Subject: [PATCH 59/80] style --- .../cugraph_pyg/examples/graph_sage_mg.py | 2 -- python/cugraph-pyg/cugraph_pyg/loader/__init__.py | 4 +++- .../cugraph_pyg/tests/mg/test_mg_cugraph_loader.py | 2 +- .../cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py | 2 +- .../cugraph_pyg/tests/test_cugraph_loader.py | 2 +- .../cugraph_pyg/tests/test_cugraph_sampler.py | 2 +- python/nx-cugraph/README.md | 12 ++++++------ 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py index 1f801a00c1d..145675c8a06 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py @@ -19,8 +19,6 @@ # To connect to dask, the scheduler JSON file must be provided. This can be done # using the `--dask_scheduler_file` argument in the mg python script being run. 
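
In practice the comments above describe a two-step workflow: launch the scheduler and workers with `start_dask.sh` (editing `CUDA_VISIBLE_DEVICES` and `WORKER_RMM_POOL_SIZE` in that script as needed), then point the training script at the scheduler JSON via `--dask_scheduler_file`. A minimal sketch of the client-side connection the MG examples perform on startup; the scheduler file name here is illustrative and this snippet is not part of the patch itself:

    from dask.distributed import Client
    from cugraph.dask.comms import comms as Comms

    # Attach to the dask cluster that start_dask.sh already brought up.
    client = Client(scheduler_file="scheduler.json")

    # Initialize cugraph's communicator on top of that cluster.
    Comms.initialize(p2p=True)

    # ... construct the DaskGraphStore (multi_gpu=True) and run training here ...

    # Tear down in reverse order once the script finishes.
    Comms.destroy()
    client.close()
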
- - from ogb.nodeproppred import NodePropPredDataset import time diff --git a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py index 384329c11c6..cad66aaa183 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py @@ -22,5 +22,7 @@ def CuGraphNeighborLoader(*args, **kwargs): - warnings.warn("CuGraphNeighborLoader has been renamed to DaskNeighborLoader", FutureWarning) + warnings.warn( + "CuGraphNeighborLoader has been renamed to DaskNeighborLoader", FutureWarning + ) return DaskNeighborLoader(*args, **kwargs) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py index 7be67b3b1aa..02d96fc20ad 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py index eb421fd9180..89fcde5a278 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py index a67c69b9ebc..f64b9fd5bad 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py index 47e114b7618..772bc5ebfa2 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index 75b5c1c5aa9..df80ae56a96 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -106,16 +106,16 @@ Below is the list of algorithms that are currently supported in nx-cugraph. 
└─ generators └─ complete_bipartite_graph centrality - ├─ betweenness + ├─ betweenness │ ├─ betweenness_centrality │ └─ edge_betweenness_centrality - ├─ degree_alg + ├─ degree_alg │ ├─ degree_centrality │ ├─ in_degree_centrality │ └─ out_degree_centrality - ├─ eigenvector + ├─ eigenvector │ └─ eigenvector_centrality - └─ katz + └─ katz └─ katz_centrality cluster ├─ average_clustering @@ -126,12 +126,12 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ louvain └─ louvain_communities components - ├─ connected + ├─ connected │ ├─ connected_components │ ├─ is_connected │ ├─ node_connected_component │ └─ number_connected_components - └─ weakly_connected + └─ weakly_connected ├─ is_weakly_connected ├─ number_weakly_connected_components └─ weakly_connected_components From 25add46ee584cb4dd10c50e3d728f80899673770 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 12:06:17 -0700 Subject: [PATCH 60/80] revert nx-cugraph --- python/nx-cugraph/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index df80ae56a96..75b5c1c5aa9 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -106,16 +106,16 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ generators └─ complete_bipartite_graph centrality - ├─ betweenness + ├─ betweenness │ ├─ betweenness_centrality │ └─ edge_betweenness_centrality - ├─ degree_alg + ├─ degree_alg │ ├─ degree_centrality │ ├─ in_degree_centrality │ └─ out_degree_centrality - ├─ eigenvector + ├─ eigenvector │ └─ eigenvector_centrality - └─ katz + └─ katz └─ katz_centrality cluster ├─ average_clustering @@ -126,12 +126,12 @@ Below is the list of algorithms that are currently supported in nx-cugraph. 
└─ louvain └─ louvain_communities components - ├─ connected + ├─ connected │ ├─ connected_components │ ├─ is_connected │ ├─ node_connected_component │ └─ number_connected_components - └─ weakly_connected + └─ weakly_connected ├─ is_weakly_connected ├─ number_weakly_connected_components └─ weakly_connected_components From b45a1a5f490f506a8cb3c52a03bf7f2cb9ce5ac1 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 12:14:56 -0700 Subject: [PATCH 61/80] minor example cleanup --- python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py index dfb49210143..65c6aee9770 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py @@ -293,7 +293,7 @@ def run_train( world_size = torch.cuda.device_count() else: world_size = args.n_devices - print("Let's use", world_size, "GPUs!") + print("Using", world_size, "GPUs...") # Create the uid needed for cuGraph comms cugraph_id = cugraph_comms_create_unique_id() From c557adf69c1d050e8de6201250bc8f17baa47a62 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 12:26:17 -0700 Subject: [PATCH 62/80] refactor tests --- .../test_dask_graph_store.py} | 0 .../test_dask_graph_store_mg.py} | 0 .../test_dask_neighbor_loader.py} | 0 .../test_dask_neighbor_loader_mg.py} | 0 .../test_sampler_utils.py} | 8 +------- .../test_sampler_utils_mg.py} | 2 +- 6 files changed, 2 insertions(+), 8 deletions(-) rename python/cugraph-pyg/cugraph_pyg/tests/{test_cugraph_store.py => data/test_dask_graph_store.py} (100%) rename python/cugraph-pyg/cugraph_pyg/tests/{mg/test_mg_cugraph_store.py => data/test_dask_graph_store_mg.py} (100%) rename python/cugraph-pyg/cugraph_pyg/tests/{test_cugraph_loader.py => loader/test_dask_neighbor_loader.py} (100%) rename python/cugraph-pyg/cugraph_pyg/tests/{mg/test_mg_cugraph_loader.py => loader/test_dask_neighbor_loader_mg.py} (100%) rename python/cugraph-pyg/cugraph_pyg/tests/{test_cugraph_sampler.py => sampler/test_sampler_utils.py} (96%) rename python/cugraph-pyg/cugraph_pyg/tests/{mg/test_mg_cugraph_sampler.py => sampler/test_sampler_utils_mg.py} (99%) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py similarity index 100% rename from python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py rename to python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py similarity index 100% rename from python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py rename to python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py similarity index 100% rename from python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py rename to python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py similarity index 100% rename from python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py rename to 
python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py similarity index 96% rename from python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py rename to python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py index 772bc5ebfa2..913928e4dd0 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py @@ -17,7 +17,7 @@ import pytest from cugraph_pyg.data import DaskGraphStore -from cugraph_pyg.sampler.cugraph_sampler import ( +from cugraph_pyg.sampler.sampler_utils import ( _sampler_output_from_sampling_results_heterogeneous, ) @@ -191,9 +191,3 @@ def test_neighbor_sample_mock_sampling_results(abc_graph): assert out.num_sampled_edges[("A", "ab", "B")].tolist() == [3, 0, 1, 0] assert out.num_sampled_edges[("B", "ba", "A")].tolist() == [0, 1, 0, 1] assert out.num_sampled_edges[("B", "bc", "C")].tolist() == [0, 2, 0, 2] - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skip("needs to be written") -def test_neighbor_sample_renumbered(): - pass diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py similarity index 99% rename from python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py rename to python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py index 89fcde5a278..fb6ff65422f 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py @@ -17,7 +17,7 @@ import pytest from cugraph_pyg.data import DaskGraphStore -from cugraph_pyg.sampler.cugraph_sampler import ( +from cugraph_pyg.sampler.sampler_utils import ( _sampler_output_from_sampling_results_heterogeneous, ) From e982bc683124da91130e22daa04921cbedf61a7f Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 13:38:59 -0700 Subject: [PATCH 63/80] update tests --- ci/run_cugraph_pyg_pytests.sh | 2 +- ci/test.sh | 2 +- ci/test_wheel_cugraph-pyg.sh | 2 +- .../tests/data/test_dask_graph_store.py | 26 ++++++++++++------- .../tests/data/test_dask_graph_store_mg.py | 25 +++++++++++------- .../tests/loader/test_dask_neighbor_loader.py | 9 +++++++ .../loader/test_dask_neighbor_loader_mg.py | 2 ++ .../cugraph_pyg/tests/nn/test_gat_conv.py | 3 ++- .../cugraph_pyg/tests/nn/test_gatv2_conv.py | 3 ++- .../tests/nn/test_hetero_gat_conv.py | 3 ++- .../cugraph_pyg/tests/nn/test_rgcn_conv.py | 3 ++- .../cugraph_pyg/tests/nn/test_sage_conv.py | 3 ++- .../tests/nn/test_transformer_conv.py | 3 ++- .../tests/sampler/test_sampler_utils.py | 3 +++ .../tests/sampler/test_sampler_utils_mg.py | 9 +++---- python/cugraph-pyg/pytest.ini | 2 ++ 16 files changed, 65 insertions(+), 35 deletions(-) diff --git a/ci/run_cugraph_pyg_pytests.sh b/ci/run_cugraph_pyg_pytests.sh index 88642e6ceb6..37c33c53446 100755 --- a/ci/run_cugraph_pyg_pytests.sh +++ b/ci/run_cugraph_pyg_pytests.sh @@ -6,7 +6,7 @@ set -euo pipefail # Support invoking run_cugraph_pyg_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-pyg/cugraph_pyg -pytest --cache-clear --ignore=tests/mg "$@" . +pytest --cache-clear --benchmark-disable "$@" . 
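
The `--ignore=tests/mg` filter is dropped because this commit registers explicit `sg` and `mg` pytest markers (see the pytest.ini hunk further down) and tags every test with one of them, so CI selects tests by marker rather than by directory. A rough sketch of the equivalent programmatic invocation, assuming pytest-benchmark is installed; the exact option list is illustrative:

    import pytest

    # Run the single-GPU suite, mirroring the "-m sg" selection added to ci/test.sh.
    exit_code = pytest.main(["-m", "sg", "--benchmark-disable", "--cache-clear", "."])

    # "-m mg" selects the multi-GPU (dask) tests instead on a multi-GPU node.
    raise SystemExit(exit_code)
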
# Test examples for e in "$(pwd)"/examples/*.py; do diff --git a/ci/test.sh b/ci/test.sh index f20fc40f85a..884ed7ac881 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -103,7 +103,7 @@ if hasArg "--run-python-tests"; then conda list cd ${CUGRAPH_ROOT}/python/cugraph-pyg/cugraph_pyg # rmat is not tested because of MG testing - pytest --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph-pytests.xml -v --cov-config=.coveragerc --cov=cugraph_pyg --cov-report=xml:${WORKSPACE}/python/cugraph_pyg/cugraph-coverage.xml --cov-report term --ignore=raft --ignore=tests/mg --ignore=tests/int --ignore=tests/generators --benchmark-disable + pytest -sv -m sg --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph-pytests.xml -v --cov-config=.coveragerc --cov=cugraph_pyg --cov-report=xml:${WORKSPACE}/python/cugraph_pyg/cugraph-coverage.xml --cov-report term --ignore=raft --benchmark-disable echo "Ran Python pytest for cugraph_pyg : return code was: $?, test script exit code is now: $EXITCODE" echo "Python pytest for cugraph-service (single-GPU only)..." diff --git a/ci/test_wheel_cugraph-pyg.sh b/ci/test_wheel_cugraph-pyg.sh index e98bf4ab56b..6a18d8defda 100755 --- a/ci/test_wheel_cugraph-pyg.sh +++ b/ci/test_wheel_cugraph-pyg.sh @@ -47,7 +47,7 @@ rapids-logger "pytest cugraph-pyg (single GPU)" pushd python/cugraph-pyg/cugraph_pyg python -m pytest \ --cache-clear \ - --ignore=tests/mg \ + --benchmark-disable \ tests # Test examples for e in "$(pwd)"/examples/*.py; do diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py index 837743f2aa9..bd608b325f4 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py @@ -33,6 +33,7 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_tensor_attr(): ta = CuGraphTensorAttr("group0", "property1") assert not ta.is_fully_specified() @@ -63,6 +64,7 @@ def test_tensor_attr(): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_edge_attr(): ea = CuGraphEdgeAttr("type0", EdgeLayout.COO, False, 10) assert ea.edge_type == "type0" @@ -98,6 +100,7 @@ def single_vertex_graph(request): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") @pytest.mark.parametrize("edge_index_type", ["numpy", "torch-cpu", "torch-gpu", "cudf"]) +@pytest.mark.sg def test_get_edge_index(graph, edge_index_type): F, G, N = graph if "torch" in edge_index_type: @@ -129,6 +132,7 @@ def test_get_edge_index(graph, edge_index_type): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_edge_types(graph): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N) @@ -145,6 +149,7 @@ def test_edge_types(graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_get_subgraph(graph): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N) @@ -163,6 +168,7 @@ def test_get_subgraph(graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_renumber_vertices_basic(single_vertex_graph): F, G, N = single_vertex_graph cugraph_store = DaskGraphStore(F, G, N) @@ -176,6 +182,7 @@ def test_renumber_vertices_basic(single_vertex_graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") 
+@pytest.mark.sg def test_renumber_vertices_multi_edge_multi_vertex(multi_edge_multi_vertex_graph_1): F, G, N = multi_edge_multi_vertex_graph_1 cugraph_store = DaskGraphStore(F, G, N) @@ -196,6 +203,7 @@ def test_renumber_vertices_multi_edge_multi_vertex(multi_edge_multi_vertex_graph @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_renumber_edges(abc_graph): F, G, N = abc_graph @@ -232,6 +240,7 @@ def test_renumber_edges(abc_graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_get_tensor(graph): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N) @@ -253,6 +262,7 @@ def test_get_tensor(graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_get_tensor_empty_idx(karate_gnn): F, G, N = karate_gnn cugraph_store = DaskGraphStore(F, G, N) @@ -264,6 +274,7 @@ def test_get_tensor_empty_idx(karate_gnn): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_multi_get_tensor(graph): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N) @@ -291,6 +302,7 @@ def test_multi_get_tensor(graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_get_all_tensor_attrs(graph): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N) @@ -320,17 +332,8 @@ def test_get_all_tensor_attrs(graph): ) -@pytest.mark.skip("not implemented") -def test_get_tensor_spec_props(graph): - raise NotImplementedError("not implemented") - - -@pytest.mark.skip("not implemented") -def test_multi_get_tensor_spec_props(multi_edge_multi_vertex_graph_1): - raise NotImplementedError("not implemented") - - @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_get_tensor_from_tensor_attrs(graph): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N) @@ -345,6 +348,7 @@ def test_get_tensor_from_tensor_attrs(graph): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_get_tensor_size(graph): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N) @@ -361,6 +365,7 @@ def test_get_tensor_size(graph): @pytest.mark.skipif( isinstance(torch_geometric, MissingModule), reason="pyg not available" ) +@pytest.mark.sg def test_get_input_nodes(karate_gnn): F, G, N = karate_gnn cugraph_store = DaskGraphStore(F, G, N) @@ -383,6 +388,7 @@ def test_get_input_nodes(karate_gnn): assert input_nodes.tolist() == torch.arange(17, dtype=torch.int32).tolist() +@pytest.mark.sg def test_serialize(multi_edge_multi_vertex_no_graph_1): import pickle diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py index edbd0ec8cc2..771271adf8d 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py @@ -101,6 +101,7 @@ def single_vertex_graph(request): @pytest.mark.parametrize( "edge_index_type", ["numpy", "torch-cpu", "torch-gpu", "cudf", "dask-cudf"] ) +@pytest.mark.mg def test_get_edge_index(graph, edge_index_type, dask_client): F, G, N = graph if "torch" in edge_index_type: @@ -143,6 +144,7 @@ def test_get_edge_index(graph, edge_index_type, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def 
test_edge_types(graph, dask_client): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) @@ -159,6 +161,7 @@ def test_edge_types(graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_get_subgraph(graph, dask_client): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) @@ -177,6 +180,7 @@ def test_get_subgraph(graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_renumber_vertices_basic(single_vertex_graph, dask_client): F, G, N = single_vertex_graph cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) @@ -190,6 +194,7 @@ def test_renumber_vertices_basic(single_vertex_graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_renumber_vertices_multi_edge_multi_vertex( multi_edge_multi_vertex_graph_1, dask_client ): @@ -212,6 +217,7 @@ def test_renumber_vertices_multi_edge_multi_vertex( @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_renumber_edges(abc_graph, dask_client): F, G, N = abc_graph @@ -248,6 +254,7 @@ def test_renumber_edges(abc_graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_get_tensor(graph, dask_client): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) @@ -269,6 +276,7 @@ def test_get_tensor(graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_get_tensor_empty_idx(karate_gnn, dask_client): F, G, N = karate_gnn cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) @@ -280,6 +288,7 @@ def test_get_tensor_empty_idx(karate_gnn, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_multi_get_tensor(graph, dask_client): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) @@ -307,6 +316,7 @@ def test_multi_get_tensor(graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_get_all_tensor_attrs(graph, dask_client): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) @@ -328,17 +338,8 @@ def test_get_all_tensor_attrs(graph, dask_client): ) -@pytest.mark.skip("not implemented") -def test_get_tensor_spec_props(graph, dask_client): - raise NotImplementedError("not implemented") - - -@pytest.mark.skip("not implemented") -def test_multi_get_tensor_spec_props(multi_edge_multi_vertex_graph_1, dask_client): - raise NotImplementedError("not implemented") - - @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_get_tensor_from_tensor_attrs(graph, dask_client): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) @@ -353,6 +354,7 @@ def test_get_tensor_from_tensor_attrs(graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_get_tensor_size(graph, dask_client): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) @@ -369,6 +371,7 @@ def test_get_tensor_size(graph, dask_client): @pytest.mark.skipif( isinstance(torch_geometric, MissingModule), reason="pyg not available" ) +@pytest.mark.mg def test_get_input_nodes(karate_gnn, dask_client): F, G, 
N = karate_gnn cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) @@ -382,6 +385,7 @@ def test_get_input_nodes(karate_gnn, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_mg_frame_handle(graph, dask_client): F, G, N = graph cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) @@ -389,6 +393,7 @@ def test_mg_frame_handle(graph, dask_client): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_cugraph_loader_large_index(dask_client): large_index = ( np.random.randint(0, 1_000_000, (100_000_000,)), diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py index f64b9fd5bad..ab7b6dacd20 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py @@ -43,6 +43,7 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_cugraph_loader_basic( karate_gnn: Tuple[ FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int] @@ -73,6 +74,7 @@ def test_cugraph_loader_basic( @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_cugraph_loader_hetero( karate_gnn: Tuple[ FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int] @@ -103,6 +105,7 @@ def test_cugraph_loader_hetero( @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_cugraph_loader_from_disk(): m = [2, 9, 99, 82, 9, 3, 18, 1, 12] n = torch.arange(1, 1 + len(m), dtype=torch.int32) @@ -160,6 +163,7 @@ def test_cugraph_loader_from_disk(): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_cugraph_loader_from_disk_subset(): m = [2, 9, 99, 82, 9, 3, 18, 1, 12] n = torch.arange(1, 1 + len(m), dtype=torch.int32) @@ -219,6 +223,7 @@ def test_cugraph_loader_from_disk_subset(): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") @pytest.mark.skipif(not HAS_TORCH_SPARSE, reason="torch-sparse not available") +@pytest.mark.sg def test_cugraph_loader_from_disk_subset_csr(): m = [2, 9, 99, 82, 11, 13] n = torch.arange(1, 1 + len(m), dtype=torch.int32) @@ -285,6 +290,7 @@ def test_cugraph_loader_from_disk_subset_csr(): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_cugraph_loader_e2e_coo(): m = [2, 9, 99, 82, 9, 3, 18, 1, 12] x = torch.randint(3000, (256, 256)).to(torch.float32) @@ -353,6 +359,7 @@ def test_cugraph_loader_e2e_coo(): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") @pytest.mark.skipif(not HAS_TORCH_SPARSE, reason="torch-sparse not available") @pytest.mark.parametrize("framework", ["pyg", "cugraph-ops"]) +@pytest.mark.sg def test_cugraph_loader_e2e_csc(framework: str): m = [2, 9, 99, 82, 9, 3, 18, 1, 12] x = torch.randint(3000, (256, 256)).to(torch.float32) @@ -457,6 +464,7 @@ def test_cugraph_loader_e2e_csc(framework: str): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") @pytest.mark.parametrize("drop_last", [True, False]) +@pytest.mark.sg def test_drop_last(drop_last): N = {"N": 10} G = { @@ -495,6 +503,7 @@ def test_drop_last(drop_last): @pytest.mark.parametrize("directory", ["local", 
"temp"]) +@pytest.mark.sg def test_load_directory( karate_gnn: Tuple[ FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py index 02d96fc20ad..9e8a85a5b67 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py @@ -21,6 +21,7 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_cugraph_loader_basic(dask_client, karate_gnn): F, G, N = karate_gnn cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") @@ -49,6 +50,7 @@ def test_cugraph_loader_basic(dask_client, karate_gnn): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_cugraph_loader_hetero(dask_client, karate_gnn): F, G, N = karate_gnn cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py index 62bebb9211d..49caee1dc11 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -25,6 +25,7 @@ @pytest.mark.parametrize("max_num_neighbors", [8, None]) @pytest.mark.parametrize("use_edge_attr", [True, False]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) +@pytest.mark.sg def test_gat_conv_equality( bias, bipartite, concat, heads, max_num_neighbors, use_edge_attr, graph, request ): diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py index a4794628410..fd07f446a17 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -23,6 +23,7 @@ @pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) @pytest.mark.parametrize("use_edge_attr", [True, False]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) +@pytest.mark.sg def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr, graph, request): pytest.importorskip("torch_geometric", reason="PyG not available") import torch diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py index 1c841a17df7..ef98d686032 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -29,6 +29,7 @@ ) @pytest.mark.parametrize("heads", [1, 3, 10]) @pytest.mark.parametrize("aggr", ["sum", "mean"]) +@pytest.mark.sg def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads): major, minor, patch = torch_geometric.__version__.split(".")[:3] pyg_version = tuple(map(int, [major, minor, patch])) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py index ded4f300c0c..631faf355aa 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -24,6 +24,7 @@ @pytest.mark.parametrize("num_bases", [1, 2, None]) @pytest.mark.parametrize("root_weight", [True, False]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) +@pytest.mark.sg def test_rgcn_conv_equality( aggr, bias, max_num_neighbors, num_bases, root_weight, graph, request ): diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py index b2977d1d175..8ef0dce60ae 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -25,6 +25,7 @@ @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("root_weight", [True, False]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) +@pytest.mark.sg def test_sage_conv_equality( aggr, bias, bipartite, max_num_neighbors, normalize, root_weight, graph, request ): diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py index fbdb244898b..10a51a0c003 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -22,6 +22,7 @@ @pytest.mark.parametrize("concat", [True, False]) @pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) @pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) +@pytest.mark.sg def test_transformer_conv_equality(bipartite, concat, heads, graph, request): pytest.importorskip("torch_geometric", reason="PyG not available") import torch diff --git a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py index 913928e4dd0..58f495b2462 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py @@ -29,6 +29,7 @@ @pytest.mark.cugraph_ops @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_neighbor_sample(basic_graph_1): F, G, N = basic_graph_1 cugraph_store = DaskGraphStore(F, G, N, order="CSR") @@ -88,6 +89,7 @@ def test_neighbor_sample(basic_graph_1): @pytest.mark.cugraph_ops @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): F, G, N = multi_edge_multi_vertex_graph_1 cugraph_store = DaskGraphStore(F, G, N, order="CSR") @@ -148,6 +150,7 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg def test_neighbor_sample_mock_sampling_results(abc_graph): F, G, N = abc_graph diff --git a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py index fb6ff65422f..2178ada771d 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py @@ -31,6 +31,7 @@ @pytest.mark.cugraph_ops @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_neighbor_sample(dask_client, basic_graph_1): F, G, N = basic_graph_1 cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") @@ -96,6 +97,7 @@ def test_neighbor_sample(dask_client, basic_graph_1): @pytest.mark.cugraph_ops @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") @pytest.mark.skip(reason="broken") +@pytest.mark.mg def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph_1): F, G, N = multi_edge_multi_vertex_graph_1 cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") @@ -160,6 +162,7 @@ def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg def test_neighbor_sample_mock_sampling_results(dask_client): N = { "A": 2, # 0, 1 @@ -230,9 +233,3 @@ def test_neighbor_sample_mock_sampling_results(dask_client): assert out.num_sampled_edges[("A", "ab", "B")].tolist() == [3, 0, 1, 0] assert out.num_sampled_edges[("B", "ba", "A")].tolist() == [0, 1, 0, 1] assert out.num_sampled_edges[("B", "bc", "C")].tolist() == [0, 2, 0, 2] - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skip("needs to be written") -def test_neighbor_sample_renumbered(dask_client): - pass diff --git a/python/cugraph-pyg/pytest.ini b/python/cugraph-pyg/pytest.ini index 
579b2245308..db99a54ae49 100644 --- a/python/cugraph-pyg/pytest.ini +++ b/python/cugraph-pyg/pytest.ini @@ -23,6 +23,8 @@ addopts = markers = slow: slow-running tests/benchmarks cugraph_ops: Tests requiring cugraph-ops + mg: Test MG code paths - number of gpu > 1 + sg: Test SG code paths and dask sg tests - number of gpu == 1 python_classes = Bench* From e9040ca4b3c98451419537ac1c4ec231bb132d17 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 13:44:53 -0700 Subject: [PATCH 64/80] fix bad import --- .../cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py | 2 +- .../cugraph_pyg/tests/data/test_dask_graph_store_mg.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py index bd608b325f4..0a997a960b8 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py @@ -12,7 +12,7 @@ # limitations under the License. import cugraph -from cugraph_pyg.data.cugraph_store import ( +from cugraph_pyg.data.dask_graph_store import ( CuGraphTensorAttr, CuGraphEdgeAttr, EdgeLayout, diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py index 771271adf8d..80eb1a09afe 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py @@ -12,7 +12,7 @@ # limitations under the License. import cugraph -from cugraph_pyg.data.cugraph_store import ( +from cugraph_pyg.data.dask_graph_store import ( CuGraphTensorAttr, CuGraphEdgeAttr, EdgeLayout, From e2f4e9679178f6b976a3434e4a967df23dac24d2 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 14:11:18 -0700 Subject: [PATCH 65/80] add basic api test --- .../tests/data/test_graph_store.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py new file mode 100644 index 00000000000..a8b93665aad --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py @@ -0,0 +1,45 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from cugraph.datasets import karate +from cugraph.utilities.utils import import_optional, MissingModule + +from cugraph_pyg.data import GraphStore + +torch = import_optional("torch") + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg +def test_graph_store_basic_api(): + df = karate.get_edgelist() + src = torch.as_tensor(df["src"], device="cuda") + dst = torch.as_tensor(df["dst"], device="cuda") + + ei = torch.stack([dst, src]) + + graph_store = GraphStore() + graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo") + + rei = graph_store.get_edge_index(("person", "knows", "person"), "coo") + + assert (ei == rei).all() + + edge_attrs = graph_store.get_all_edge_attrs() + assert len(edge_attrs) == 1 + + graph_store.remove_edge_index(("person", "knows", "person"), "coo") + edge_attrs = graph_store.get_all_edge_attrs() + assert len(edge_attrs) == 0 From 887a7fe81fa551188ec680caa25d43d0fc6b9255 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 14:31:17 -0700 Subject: [PATCH 66/80] feature store tests, fix --- .../cugraph_pyg/data/feature_store.py | 2 +- .../tests/data/test_feature_store.py | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py diff --git a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py index 20a9ecdc359..42dda42a9e1 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py @@ -102,7 +102,7 @@ def _remove_tensor( if attr.group_name not in self.__features: return False - if attr.attr_name not in self.__features[attr.group_name]: + if attr.attr_name not in self.__features[attr.group_name].keys(): return False del self.__features[attr.group_name][attr.attr_name] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py new file mode 100644 index 00000000000..ab5f1e217bb --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from cugraph.utilities.utils import import_optional, MissingModule + +from cugraph_pyg.data import TensorDictFeatureStore + +torch = import_optional("torch") + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg +def test_tensordict_feature_store_basic_api(): + feature_store = TensorDictFeatureStore() + + node_features_0 = torch.randint(128, (100, 1000)) + node_features_1 = torch.randint(256, (100, 10)) + + other_features = torch.randint(1024, (10, 5)) + + feature_store["node", "feat0"] = node_features_0 + feature_store["node", "feat1"] = node_features_1 + feature_store["other", "feat"] = other_features + + assert (feature_store["node"]["feat0"][:] == node_features_0).all() + assert (feature_store["node"]["feat1"][:] == node_features_1).all() + assert (feature_store["other"]["feat"][:] == other_features).all() + + assert len(feature_store.get_all_tensor_attrs()) == 3 + + del feature_store["node", "feat0"] + assert len(feature_store.get_all_tensor_attrs()) == 2 From 4a33bde236c15eacb0a26482ef8b1de25d9762a8 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Tue, 14 May 2024 15:50:27 -0700 Subject: [PATCH 67/80] wrap up tests --- .../cugraph_pyg/data/graph_store.py | 18 ++- .../tests/data/test_graph_store_mg.py | 45 +++++++ .../tests/loader/test_neighbor_loader.py | 54 +++++++++ .../tests/loader/test_neighbor_loader_mg.py | 111 ++++++++++++++++++ .../cugraph/gnn/data_loading/dist_sampler.py | 4 +- 5 files changed, 225 insertions(+), 7 deletions(-) create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py index 9bac04fc3b0..01af7fd6ed0 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py @@ -147,8 +147,8 @@ def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: self.__graph = pylibcugraph.MGGraph( self._resource_handle, graph_properties, - [cupy.asarray(edgelist_dict["src"])], - [cupy.asarray(edgelist_dict["dst"])], + [cupy.asarray(edgelist_dict["src"]).astype("int64")], + [cupy.asarray(edgelist_dict["dst"]).astype("int64")], vertices_array=[vertices_array], edge_id_array=[cupy.asarray(edgelist_dict["eid"])], edge_type_array=[cupy.asarray(edgelist_dict["etp"])], @@ -157,8 +157,8 @@ def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: self.__graph = pylibcugraph.SGGraph( self._resource_handle, graph_properties, - cupy.asarray(edgelist_dict["src"]), - cupy.asarray(edgelist_dict["dst"]), + cupy.asarray(edgelist_dict["src"]).astype("int64"), + cupy.asarray(edgelist_dict["dst"]).astype("int64"), vertices_array=cupy.arange( sum(self._num_vertices().values()), dtype="int64" ), @@ -184,14 +184,20 @@ def _num_vertices(self) -> Dict[str, int]: ) else: if edge_attr.edge_type[0] not in num_vertices: - num_vertices[edge_attr.edge_type[0]] = ( + num_vertices[edge_attr.edge_type[0]] = int( self.__edge_indices[edge_attr.edge_type][0].max() + 1 ) if edge_attr.edge_type[2] not in num_vertices: - num_vertices[edge_attr.edge_type[1]] = ( + num_vertices[edge_attr.edge_type[1]] = int( self.__edge_indices[edge_attr.edge_type][1].max() + 1 ) + if self.is_multi_gpu: + vtypes = num_vertices.keys() + for vtype in vtypes: + sz = 
torch.tensor(num_vertices[vtype], device="cuda") + torch.distributed.all_reduce(sz, op=torch.distributed.ReduceOp.MAX) + num_vertices[vtype] = int(sz) return num_vertices @property diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py new file mode 100644 index 00000000000..14540b7e17d --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py @@ -0,0 +1,45 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from cugraph.datasets import karate +from cugraph.utilities.utils import import_optional, MissingModule + +from cugraph_pyg.data import GraphStore + +torch = import_optional("torch") + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg +def test_graph_store_basic_api_mg(): + df = karate.get_edgelist() + src = torch.as_tensor(df["src"], device="cuda") + dst = torch.as_tensor(df["dst"], device="cuda") + + ei = torch.stack([dst, src]) + + graph_store = GraphStore(is_multi_gpu=True) + graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo") + + rei = graph_store.get_edge_index(("person", "knows", "person"), "coo") + + assert (ei == rei).all() + + edge_attrs = graph_store.get_all_edge_attrs() + assert len(edge_attrs) == 1 + + graph_store.remove_edge_index(("person", "knows", "person"), "coo") + edge_attrs = graph_store.get_all_edge_attrs() + assert len(edge_attrs) == 0 diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py new file mode 100644 index 00000000000..8edb5276953 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from cugraph.datasets import karate +from cugraph.utilities.utils import import_optional, MissingModule + +from cugraph_pyg.data import TensorDictFeatureStore, GraphStore +from cugraph_pyg.loader import NeighborLoader + +torch = import_optional("torch") +torch_geometric = import_optional("torch_geometric") + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.sg +def test_neighbor_loader(): + """ + Basic e2e test that covers loading and sampling. 
+ """ + + df = karate.get_edgelist() + src = torch.as_tensor(df["src"], device="cuda") + dst = torch.as_tensor(df["dst"], device="cuda") + + ei = torch.stack([dst, src]) + + graph_store = GraphStore() + graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo") + + feature_store = TensorDictFeatureStore() + feature_store["person", "feat"] = torch.randint(128, (34, 16)) + + loader = NeighborLoader( + (feature_store, graph_store), + [5, 5], + input_nodes=torch.arange(34), + directory=".", + ) + + for batch in loader: + assert isinstance(batch, torch_geometric.data.Data) + assert (feature_store["person", "feat"][batch.n_id] == batch.feat).all() diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py new file mode 100644 index 00000000000..6a5f46b0940 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py @@ -0,0 +1,111 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import os + +from cugraph.datasets import karate +from cugraph.utilities.utils import import_optional, MissingModule + +from cugraph_pyg.data import TensorDictFeatureStore, GraphStore +from cugraph_pyg.loader import NeighborLoader + +from cugraph.gnn import ( + cugraph_comms_init, + cugraph_comms_shutdown, + cugraph_comms_create_unique_id, +) + +torch = import_optional("torch") +torch_geometric = import_optional("torch_geometric") + + +def init_pytorch_worker(rank, world_size, cugraph_id): + import rmm + + rmm.reinitialize( + devices=rank, + ) + + import cupy + + cupy.cuda.Device(rank).use() + from rmm.allocators.cupy import rmm_cupy_allocator + + cupy.cuda.set_allocator(rmm_cupy_allocator) + + from cugraph.testing.mg_utils import enable_spilling + + enable_spilling() + + torch.cuda.set_device(rank) + + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size) + + cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) + + +def run_test_neighbor_loader_mg(rank, uid, world_size, specify_size): + """ + Basic e2e test that covers loading and sampling. 
+ """ + init_pytorch_worker(rank, world_size, uid) + + df = karate.get_edgelist() + src = torch.as_tensor(df["src"], device="cuda") + dst = torch.as_tensor(df["dst"], device="cuda") + + ei = torch.stack([dst, src]) + ei = torch.tensor_split(ei.clone(), world_size, axis=1)[rank] + + sz = (34, 34) if specify_size else None + graph_store = GraphStore(is_multi_gpu=True) + graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo", False, sz) + + feature_store = TensorDictFeatureStore() + feature_store["person", "feat"] = torch.randint(128, (34, 16)) + + ix_train = torch.tensor_split(torch.arange(34), world_size, axis=0)[rank] + + loader = NeighborLoader( + (feature_store, graph_store), + [5, 5], + input_nodes=ix_train, + ) + + for batch in loader: + assert isinstance(batch, torch_geometric.data.Data) + assert (feature_store["person", "feat"][batch.n_id] == batch.feat).all() + + cugraph_comms_shutdown() + + +@pytest.mark.parametrize("specify_size", [True, False]) +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.mg +def test_neighbor_loader_mg(specify_size): + uid = cugraph_comms_create_unique_id() + world_size = torch.cuda.device_count() + + torch.multiprocessing.spawn( + run_test_neighbor_loader_mg, + args=( + uid, + world_size, + specify_size, + ), + nprocs=world_size, + ) diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py index 20e0391fff1..724acef546c 100644 --- a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py @@ -55,7 +55,9 @@ def __init__( ex = re.compile(r"batch\=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet") filematch = [ex.match(f) for f in files] filematch = [f for f in filematch if f] - filematch = [f for f in filematch if int(f[1]) == rank] + + if rank is not None: + filematch = [f for f in filematch if int(f[1]) == rank] batch_count = sum([int(f[4]) - int(f[2]) + 1 for f in filematch]) filematch = sorted(filematch, key=lambda f: int(f[2]), reverse=True) From 54ab2acb442658648acc3b73c98d4ea58beed97b Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 17 May 2024 13:17:11 -0700 Subject: [PATCH 68/80] style --- python/nx-cugraph/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index 75b5c1c5aa9..df80ae56a96 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -106,16 +106,16 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ generators └─ complete_bipartite_graph centrality - ├─ betweenness + ├─ betweenness │ ├─ betweenness_centrality │ └─ edge_betweenness_centrality - ├─ degree_alg + ├─ degree_alg │ ├─ degree_centrality │ ├─ in_degree_centrality │ └─ out_degree_centrality - ├─ eigenvector + ├─ eigenvector │ └─ eigenvector_centrality - └─ katz + └─ katz └─ katz_centrality cluster ├─ average_clustering @@ -126,12 +126,12 @@ Below is the list of algorithms that are currently supported in nx-cugraph. 
└─ louvain └─ louvain_communities components - ├─ connected + ├─ connected │ ├─ connected_components │ ├─ is_connected │ ├─ node_connected_component │ └─ number_connected_components - └─ weakly_connected + └─ weakly_connected ├─ is_weakly_connected ├─ number_weakly_connected_components └─ weakly_connected_components From 64b21bed448649060ac1446c6b209e039900e811 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 17 May 2024 13:17:51 -0700 Subject: [PATCH 69/80] revert nx change --- python/nx-cugraph/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index df80ae56a96..75b5c1c5aa9 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -106,16 +106,16 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ generators └─ complete_bipartite_graph centrality - ├─ betweenness + ├─ betweenness │ ├─ betweenness_centrality │ └─ edge_betweenness_centrality - ├─ degree_alg + ├─ degree_alg │ ├─ degree_centrality │ ├─ in_degree_centrality │ └─ out_degree_centrality - ├─ eigenvector + ├─ eigenvector │ └─ eigenvector_centrality - └─ katz + └─ katz └─ katz_centrality cluster ├─ average_clustering @@ -126,12 +126,12 @@ Below is the list of algorithms that are currently supported in nx-cugraph. └─ louvain └─ louvain_communities components - ├─ connected + ├─ connected │ ├─ connected_components │ ├─ is_connected │ ├─ node_connected_component │ └─ number_connected_components - └─ weakly_connected + └─ weakly_connected ├─ is_weakly_connected ├─ number_weakly_connected_components └─ weakly_connected_components From 47c897dc3fac4d46a2d0e9170bdc7117b1117be1 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 17 May 2024 13:26:10 -0700 Subject: [PATCH 70/80] rename snmg example --- .../examples/{gcn_dist_mg.py => gcn_dist_snmg.py} | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) rename python/cugraph-pyg/cugraph_pyg/examples/{gcn_dist_mg.py => gcn_dist_snmg.py} (98%) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py similarity index 98% rename from python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py rename to python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py index 65c6aee9770..585d5427ff0 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py @@ -11,6 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Single-node, multi-GPU example. + import argparse import os import tempfile @@ -265,13 +267,6 @@ def run_train( parser.add_argument("--dataset_root", type=str, default="dataset") parser.add_argument("--dataset", type=str, default="ogbn-products") - parser.add_argument( - "--n_devices", - type=int, - default=-1, - help="1-8 to use that many GPUs. 
Defaults to all available GPUs", - ) - args = parser.parse_args() wall_clock_start = time.perf_counter() From 72b5d76e6648ee398a7ce401e82de51af8f65df9 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 17 May 2024 13:34:56 -0700 Subject: [PATCH 71/80] fix sampler test --- .../tests/sampler/test_sampler_utils.py | 6 +++--- .../tests/sampler/test_sampler_utils_mg.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py index 6f567de3672..7659fdc386f 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py @@ -191,6 +191,6 @@ def test_neighbor_sample_mock_sampling_results(abc_graph): assert out.num_sampled_nodes["C"] == [0, 0, 2, 0, 1] assert len(out.num_sampled_edges) == 3 - assert out.num_sampled_edges[("A", "ab", "B")].tolist() == [3, 0, 1, 0] - assert out.num_sampled_edges[("B", "ba", "A")].tolist() == [0, 1, 0, 1] - assert out.num_sampled_edges[("B", "bc", "C")].tolist() == [0, 2, 0, 2] + assert out.num_sampled_edges[("A", "ab", "B")] == [3, 0, 1, 0] + assert out.num_sampled_edges[("B", "ba", "A")] == [0, 1, 0, 1] + assert out.num_sampled_edges[("B", "bc", "C")] == [0, 2, 0, 2] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py index 2178ada771d..91e0668b3c1 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py @@ -88,10 +88,10 @@ def test_neighbor_sample(dask_client, basic_graph_1): # check the hop dictionaries assert len(out.num_sampled_nodes) == 1 - assert out.num_sampled_nodes["vt1"].tolist() == [4, 1] + assert out.num_sampled_nodes["vt1"] == [4, 1] assert len(out.num_sampled_edges) == 1 - assert out.num_sampled_edges[("vt1", "pig", "vt1")].tolist() == [6] + assert out.num_sampled_edges[("vt1", "pig", "vt1")] == [6] @pytest.mark.cugraph_ops @@ -225,11 +225,11 @@ def test_neighbor_sample_mock_sampling_results(dask_client): assert out.col[("B", "ba", "A")].tolist() == [1, 1] assert len(out.num_sampled_nodes) == 3 - assert out.num_sampled_nodes["A"].tolist() == [2, 0, 0, 0, 0] - assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 0, 0] - assert out.num_sampled_nodes["C"].tolist() == [0, 0, 2, 0, 1] + assert out.num_sampled_nodes["A"] == [2, 0, 0, 0, 0] + assert out.num_sampled_nodes["B"] == [0, 2, 0, 0, 0] + assert out.num_sampled_nodes["C"] == [0, 0, 2, 0, 1] assert len(out.num_sampled_edges) == 3 - assert out.num_sampled_edges[("A", "ab", "B")].tolist() == [3, 0, 1, 0] - assert out.num_sampled_edges[("B", "ba", "A")].tolist() == [0, 1, 0, 1] - assert out.num_sampled_edges[("B", "bc", "C")].tolist() == [0, 2, 0, 2] + assert out.num_sampled_edges[("A", "ab", "B")] == [3, 0, 1, 0] + assert out.num_sampled_edges[("B", "ba", "A")] == [0, 1, 0, 1] + assert out.num_sampled_edges[("B", "bc", "C")] == [0, 2, 0, 2] From 6e54068b8b2a2ccfa6162f0550378e40c0843892 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Fri, 17 May 2024 18:08:20 -0400 Subject: [PATCH 72/80] Update gcn_dist_snmg.py --- python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py 
b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py index 585d5427ff0..632fdf73f23 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py @@ -267,6 +267,13 @@ def run_train( parser.add_argument("--dataset_root", type=str, default="dataset") parser.add_argument("--dataset", type=str, default="ogbn-products") + parser.add_argument( + "--n_devices", + type=int, + default=-1, + help="1-8 to use that many GPUs. Defaults to all available GPUs", + ) + args = parser.parse_args() wall_clock_start = time.perf_counter() From 3f8927cb2f2c40dc85370f4e690b8d14ba4de947 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 22 May 2024 14:31:27 -0700 Subject: [PATCH 73/80] style --- python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py index 632fdf73f23..2eff2c12d74 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py @@ -273,7 +273,7 @@ def run_train( default=-1, help="1-8 to use that many GPUs. Defaults to all available GPUs", ) - + args = parser.parse_args() wall_clock_start = time.perf_counter() From 31090db492d266b05fd87ba6a92cd856453a203d Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Thu, 23 May 2024 15:09:59 -0400 Subject: [PATCH 74/80] install tensordict in wheel test --- ci/test_wheel_cugraph-pyg.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/test_wheel_cugraph-pyg.sh b/ci/test_wheel_cugraph-pyg.sh index fdf11e99c79..984a46a1c66 100755 --- a/ci/test_wheel_cugraph-pyg.sh +++ b/ci/test_wheel_cugraph-pyg.sh @@ -41,6 +41,7 @@ rapids-retry python -m pip install \ torch_sparse \ torch_cluster \ torch_spline_conv \ + tensordict \ -f ${PYG_URL} rapids-logger "pytest cugraph-pyg (single GPU)" From 300edb8484368099a93c6b8eb1026ad5ac87eb75 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Thu, 23 May 2024 12:14:24 -0700 Subject: [PATCH 75/80] install tensordict in python tests --- ci/test_python.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index 5ea893eca60..a665bf20bb2 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -219,7 +219,10 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then # Install pyg dependencies (which requires pip) - pip install ogb + pip install \ + ogb \ + tensordict + pip install \ pyg_lib \ torch_scatter \ From 9dc757af521318cd9d8f2d665cee1e42a1390938 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 24 May 2024 06:50:42 -0700 Subject: [PATCH 76/80] don't install torch-sparse, torch-spline-conv to fix ci issue --- ci/test_python.sh | 2 -- ci/test_wheel_cugraph-pyg.sh | 2 -- 2 files changed, 4 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index a665bf20bb2..50913263b32 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -227,8 +227,6 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then pyg_lib \ torch_scatter \ torch_sparse \ - torch_cluster \ - torch_spline_conv \ -f ${PYG_URL} rapids-print-env diff --git a/ci/test_wheel_cugraph-pyg.sh b/ci/test_wheel_cugraph-pyg.sh index 984a46a1c66..ca91bb0a1a6 100755 --- a/ci/test_wheel_cugraph-pyg.sh +++ b/ci/test_wheel_cugraph-pyg.sh @@ -39,8 +39,6 @@ rapids-retry python -m pip install \ pyg_lib \ torch_scatter \ 
torch_sparse \ - torch_cluster \ - torch_spline_conv \ tensordict \ -f ${PYG_URL} From d2fff159e7b53bda5672e429e93e723512cf7953 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Fri, 24 May 2024 13:26:27 -0700 Subject: [PATCH 77/80] don't call as array on 'None' --- python/cugraph/cugraph/gnn/data_loading/dist_sampler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py index 724acef546c..52638230b9b 100644 --- a/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/dist_sampler.py @@ -762,7 +762,9 @@ def sample_batches( compression=self.__compression, compress_per_hop=self.__compress_per_hop, retain_seeds=self._retain_original_seeds, - label_offsets=cupy.asarray(label_offsets), + label_offsets=None + if label_offsets is None + else cupy.asarray(label_offsets), return_dict=True, ) sampling_results_dict["rank"] = rank From 0551718f25c10515f650039d304ab6362df6e19c Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 29 May 2024 10:47:20 -0700 Subject: [PATCH 78/80] clean up snmg example --- python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py index 2eff2c12d74..859205ce6db 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py @@ -106,7 +106,7 @@ def run_train( from cugraph_pyg.loader import NeighborLoader graph_store = GraphStore(is_multi_gpu=True) - ixr = torch.tensor_split(data.edge_index, world_size, axis=1)[rank] + ixr = torch.tensor_split(data.edge_index, world_size, dim=1)[rank] graph_store[ ("node", "rel", "node"), "coo", False, (data.num_nodes, data.num_nodes) ] = ixr @@ -244,7 +244,6 @@ def run_train( print( f"Test Accuracy: {acc_test * 100.0:.4f}%", ) - # dist.barrier() if rank == 0: total_time = round(time.perf_counter() - wall_clock_start, 2) @@ -252,6 +251,7 @@ def run_train( print("total_time - prep_time =", total_time - prep_time, "seconds") cugraph_comms_shutdown() + dist.destroy_process_group() if __name__ == "__main__": From 0b2156e8b0d4c265aa15ba592dca439a804f7cf7 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 29 May 2024 14:14:15 -0700 Subject: [PATCH 79/80] skip snmg example in CI --- .../cugraph_pyg/examples/gcn_dist_snmg.py | 124 +++++++++--------- 1 file changed, 65 insertions(+), 59 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py index 859205ce6db..7d24204d745 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py @@ -17,6 +17,7 @@ import os import tempfile import time +import warnings import torch import torch.distributed as dist @@ -255,68 +256,73 @@ def run_train( if __name__ == "__main__": + if "CI_RUN" in os.environ and os.environ["CI_RUN"] == 1: + warnings.warn("Skipping SMNG example in CI due to memory limit") + else: + parser = argparse.ArgumentParser() + parser.add_argument("--hidden_channels", type=int, default=256) + parser.add_argument("--num_layers", type=int, default=2) + parser.add_argument("--lr", type=float, default=0.001) + parser.add_argument("--epochs", type=int, default=4) + parser.add_argument("--batch_size", 
type=int, default=1024) + parser.add_argument("--fan_out", type=int, default=30) + parser.add_argument("--tempdir_root", type=str, default=None) + parser.add_argument("--dataset_root", type=str, default="dataset") + parser.add_argument("--dataset", type=str, default="ogbn-products") + + parser.add_argument( + "--n_devices", + type=int, + default=-1, + help="1-8 to use that many GPUs. Defaults to all available GPUs", + ) - parser = argparse.ArgumentParser() - parser.add_argument("--hidden_channels", type=int, default=256) - parser.add_argument("--num_layers", type=int, default=2) - parser.add_argument("--lr", type=float, default=0.001) - parser.add_argument("--epochs", type=int, default=4) - parser.add_argument("--batch_size", type=int, default=1024) - parser.add_argument("--fan_out", type=int, default=30) - parser.add_argument("--tempdir_root", type=str, default=None) - parser.add_argument("--dataset_root", type=str, default="dataset") - parser.add_argument("--dataset", type=str, default="ogbn-products") - - parser.add_argument( - "--n_devices", - type=int, - default=-1, - help="1-8 to use that many GPUs. Defaults to all available GPUs", - ) - - args = parser.parse_args() - wall_clock_start = time.perf_counter() + args = parser.parse_args() + wall_clock_start = time.perf_counter() - from rmm.allocators.torch import rmm_torch_allocator + from rmm.allocators.torch import rmm_torch_allocator - torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - dataset = PygNodePropPredDataset(name=args.dataset, root=args.dataset_root) - split_idx = dataset.get_idx_split() - data = dataset[0] - data.y = data.y.reshape(-1) + dataset = PygNodePropPredDataset(name=args.dataset, root=args.dataset_root) + split_idx = dataset.get_idx_split() + data = dataset[0] + data.y = data.y.reshape(-1) - model = torch_geometric.nn.models.GCN( - dataset.num_features, args.hidden_channels, args.num_layers, dataset.num_classes - ) - - print("Data =", data) - if args.n_devices == -1: - world_size = torch.cuda.device_count() - else: - world_size = args.n_devices - print("Using", world_size, "GPUs...") - - # Create the uid needed for cuGraph comms - cugraph_id = cugraph_comms_create_unique_id() - - with tempfile.TemporaryDirectory(dir=args.tempdir_root) as tempdir: - mp.spawn( - run_train, - args=( - data, - world_size, - cugraph_id, - model, - args.epochs, - args.batch_size, - args.fan_out, - split_idx, - dataset.num_classes, - wall_clock_start, - tempdir, - args.num_layers, - ), - nprocs=world_size, - join=True, + model = torch_geometric.nn.models.GCN( + dataset.num_features, + args.hidden_channels, + args.num_layers, + dataset.num_classes, ) + + print("Data =", data) + if args.n_devices == -1: + world_size = torch.cuda.device_count() + else: + world_size = args.n_devices + print("Using", world_size, "GPUs...") + + # Create the uid needed for cuGraph comms + cugraph_id = cugraph_comms_create_unique_id() + + with tempfile.TemporaryDirectory(dir=args.tempdir_root) as tempdir: + mp.spawn( + run_train, + args=( + data, + world_size, + cugraph_id, + model, + args.epochs, + args.batch_size, + args.fan_out, + split_idx, + dataset.num_classes, + wall_clock_start, + tempdir, + args.num_layers, + ), + nprocs=world_size, + join=True, + ) From 987021dcca359e6a3377ffbd04a384ed180a1ef8 Mon Sep 17 00:00:00 2001 From: Alexandria Barghi Date: Wed, 29 May 2024 14:39:10 -0700 Subject: [PATCH 80/80] actually set variable --- ci/run_cugraph_pyg_pytests.sh | 3 +++ 
ci/test_wheel_cugraph-pyg.sh | 3 +++ python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/run_cugraph_pyg_pytests.sh b/ci/run_cugraph_pyg_pytests.sh index 37c33c53446..fb27f16d79e 100755 --- a/ci/run_cugraph_pyg_pytests.sh +++ b/ci/run_cugraph_pyg_pytests.sh @@ -8,6 +8,9 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-pyg/cugraph_ pytest --cache-clear --benchmark-disable "$@" . +# Used to skip certain examples in CI due to memory limitations +export CI_RUN=1 + # Test examples for e in "$(pwd)"/examples/*.py; do rapids-logger "running example $e" diff --git a/ci/test_wheel_cugraph-pyg.sh b/ci/test_wheel_cugraph-pyg.sh index ca91bb0a1a6..1004063cc38 100755 --- a/ci/test_wheel_cugraph-pyg.sh +++ b/ci/test_wheel_cugraph-pyg.sh @@ -24,6 +24,9 @@ python -m pip install $(ls ./dist/${python_package_name}*.whl)[test] # RAPIDS_DATASET_ROOT_DIR is used by test scripts export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)" +# Used to skip certain examples in CI due to memory limitations +export CI_RUN=1 + if [[ "${CUDA_VERSION}" == "11.8.0" ]]; then PYTORCH_URL="https://download.pytorch.org/whl/cu118" PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu118.html" diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py index 7d24204d745..b1bb0240e71 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py @@ -256,7 +256,7 @@ def run_train( if __name__ == "__main__": - if "CI_RUN" in os.environ and os.environ["CI_RUN"] == 1: + if "CI_RUN" in os.environ and os.environ["CI_RUN"] == "1": warnings.warn("Skipping SMNG example in CI due to memory limit") else: parser = argparse.ArgumentParser()
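
For readers following the API these patches introduce, below is a minimal, self-contained single-GPU sketch of the GraphStore / TensorDictFeatureStore / NeighborLoader flow that the new tests in this series exercise. It mirrors test_neighbor_loader above rather than defining anything new: the karate edge list, the (34, 16) feature shape, and the [5, 5] fan-out are taken from that test, and running it outside pytest on a CUDA-capable machine with cugraph-pyg, pylibcugraph, and torch_geometric installed is an assumption.

    # Illustrative sketch only; follows the single-GPU test added in this series.
    import torch
    from cugraph.datasets import karate
    from cugraph_pyg.data import GraphStore, TensorDictFeatureStore
    from cugraph_pyg.loader import NeighborLoader

    # Load the karate edge list onto the GPU, stacking [dst, src] exactly as
    # the tests in this series do.
    df = karate.get_edgelist()
    src = torch.as_tensor(df["src"], device="cuda")
    dst = torch.as_tensor(df["dst"], device="cuda")
    ei = torch.stack([dst, src])

    # Register the edge index and a per-node feature tensor.
    graph_store = GraphStore()
    graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo")

    feature_store = TensorDictFeatureStore()
    feature_store["person", "feat"] = torch.randint(128, (34, 16))

    # Sample two-hop neighborhoods (fan-out 5 per hop) for every node.
    loader = NeighborLoader(
        (feature_store, graph_store),
        [5, 5],
        input_nodes=torch.arange(34),
    )

    for batch in loader:
        # Each batch is a torch_geometric.data.Data object; its features are
        # the rows of the feature store selected by batch.n_id.
        assert (feature_store["person", "feat"][batch.n_id] == batch.feat).all()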