diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0f05aedf1a1..865d06b20e4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: additional_dependencies: - flake8==6.0.0 - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v16.0.1 + rev: v16.0.6 hooks: - id: clang-format exclude: | diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml index 2fbc6360c04..9e9fcd2faf1 100644 --- a/conda/recipes/cugraph-dgl/meta.yaml +++ b/conda/recipes/cugraph-dgl/meta.yaml @@ -26,6 +26,7 @@ requirements: - dgl >=1.1.0.cu* - numba >=0.57 - numpy >=1.21 + - pylibcugraphops ={{ version }} - python - pytorch diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2eaf4361730..a78c06474c0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -166,6 +166,7 @@ endif() include(cmake/thirdparty/get_nccl.cmake) include(cmake/thirdparty/get_cuhornet.cmake) +include(cmake/thirdparty/get_ucp.cmake) if(BUILD_TESTS) include(cmake/thirdparty/get_gtest.cmake) @@ -228,6 +229,7 @@ set(CUGRAPH_SOURCES src/sampling/uniform_neighbor_sampling_mg.cpp src/sampling/uniform_neighbor_sampling_sg.cpp src/sampling/renumber_sampled_edgelist_sg.cu + src/sampling/sampling_post_processing_sg.cu src/cores/core_number_sg.cu src/cores/core_number_mg.cu src/cores/k_core_sg.cu @@ -291,6 +293,7 @@ set(CUGRAPH_SOURCES src/community/triangle_count_mg.cu src/traversal/k_hop_nbrs_sg.cu src/traversal/k_hop_nbrs_mg.cu + src/mtmg/vertex_result.cu ) if(USE_CUGRAPH_OPS) diff --git a/cpp/cmake/thirdparty/get_ucp.cmake b/cpp/cmake/thirdparty/get_ucp.cmake new file mode 100644 index 00000000000..dcc4956a34e --- /dev/null +++ b/cpp/cmake/thirdparty/get_ucp.cmake @@ -0,0 +1,35 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_ucp) + + if(TARGET UCP::UCP) + return() + endif() + + rapids_find_generate_module(UCP + HEADER_NAMES ucp.h + LIBRARY_NAMES ucp + INCLUDE_SUFFIXES ucp/api + ) + + # Currently UCP has no CMake build-system so we require + # it built and installed on the machine already + rapids_find_package(UCP REQUIRED) + +endfunction() + +find_and_configure_ucp() diff --git a/cpp/include/cugraph/detail/utility_wrappers.hpp b/cpp/include/cugraph/detail/utility_wrappers.hpp index a15dbf34cf9..faa0fbb841b 100644 --- a/cpp/include/cugraph/detail/utility_wrappers.hpp +++ b/cpp/include/cugraph/detail/utility_wrappers.hpp @@ -37,8 +37,8 @@ namespace detail { * @param[in] stream_view stream view * @param[out] d_value device array to fill * @param[in] size number of elements in array - * @param[in] min_value minimum value - * @param[in] max_value maximum value + * @param[in] min_value minimum value (inclusive) + * @param[in] max_value maximum value (exclusive) * @param[in] rng_state The RngState instance holding pseudo-random number generator state. * */ diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 200ee725b7a..5c1e9d5311f 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -919,6 +919,10 @@ rmm::device_uvector select_random_vertices( /** * @brief renumber sampling output * + * @deprecated This API will be deprecated and will be replaced by the + * renumber_and_compress_sampled_edgelist and renumber_and_sort_sampled_edgelist functions in + * sampling_functions.hpp. + * * This function renumbers sampling function (e.g. uniform_neighbor_sample) outputs satisfying the * following requirements. * diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp new file mode 100644 index 00000000000..37398891370 --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief Wrap an object to be available for each GPU + * + * In the MTMG environment we need the ability to manage a collection of objects + * that are associated with a particular GPU, and fetch the objects from an + * arbitrary GPU thread. This object will wrap any object and allow it to be + * accessed from different threads. 
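+ *
+ * A brief usage sketch (illustrative only; assumes `spans` is a populated
+ * device_shared_device_span_t<result_t> and `handle` is the mtmg handle of the
+ * calling thread):
+ * @code
+ * raft::device_span<result_t const> my_span = spans.get(handle);
+ * // my_span references the device memory owned by the GPU this thread is mapped to
+ * @endcode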
+ */ +template +using device_shared_device_span_t = device_shared_wrapper_t>; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp new file mode 100644 index 00000000000..7f3992b73bd --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief Wrap an object to be available for each GPU + * + * In the MTMG environment we need the ability to manage a collection of objects + * that are associated with a particular GPU, and fetch the objects from an + * arbitrary GPU thread. This object will wrap any object and allow it to be + * accessed from different threads. + */ +template +class device_shared_device_vector_t : public device_shared_wrapper_t> { + using parent_t = detail::device_shared_wrapper_t>; + + public: + /** + * @brief Create a device_shared_device_span (read only view) + */ + auto view() + { + std::lock_guard lock(parent_t::lock_); + + device_shared_device_span_t result; + + std::for_each(parent_t::objects_.begin(), parent_t::objects_.end(), [&result](auto& p) { + result.set(p.first, raft::device_span{p.second.data(), p.second.size()}); + }); + + return result; + } +}; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp new file mode 100644 index 00000000000..c4cacb401af --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief Wrap an object to be available for each GPU + * + * In the MTMG environment we need the ability to manage a collection of objects + * that are associated with a particular GPU, and fetch the objects from an + * arbitrary GPU thread. This object will wrap any object and allow it to be + * accessed from different threads. 
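+ *
+ * A minimal usage sketch (illustrative only; one object is stored per local GPU rank,
+ * keyed by handle.get_local_rank()):
+ * @code
+ * cugraph::mtmg::detail::device_shared_wrapper_t<int> wrapped;
+ * wrapped.set(handle, 42);           // store this GPU's instance (at most once per rank)
+ * int& value = wrapped.get(handle);  // any thread mapped to the same GPU gets the same object
+ * @endcode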
+ */ +template +class device_shared_wrapper_t { + public: + using wrapped_t = T; + + device_shared_wrapper_t() = default; + device_shared_wrapper_t(device_shared_wrapper_t&& other) : objects_{std::move(other.objects_)} {} + device_shared_wrapper_t& operator=(device_shared_wrapper_t&& other) + { + objects_ = std::move(other.objects_); + return *this; + } + + /** + * @brief Move a wrapped object into the wrapper for this thread + * + * @param handle Handle is used to identify the GPU we associated this object with + * @param obj Wrapped object + */ + void set(cugraph::mtmg::handle_t const& handle, wrapped_t&& obj) + { + std::lock_guard lock(lock_); + + auto pos = objects_.find(handle.get_local_rank()); + CUGRAPH_EXPECTS(pos == objects_.end(), "Cannot overwrite wrapped object"); + + objects_.insert(std::make_pair(handle.get_local_rank(), std::move(obj))); + } + + /** + * @brief Move a wrapped object into the wrapper for this thread + * + * @param local_rank Identify which GPU to associated this object with + * @param obj Wrapped object + */ + void set(int local_rank, wrapped_t&& obj) + { + std::lock_guard lock(lock_); + + auto pos = objects_.find(local_rank); + CUGRAPH_EXPECTS(pos == objects_.end(), "Cannot overwrite wrapped object"); + + objects_.insert(std::make_pair(local_rank, std::move(obj))); + } + + public: + /** + * @brief Get reference to an object for a particular thread + * + * @param handle Handle is used to identify the GPU we associated this object with + * @return Reference to the wrapped object + */ + wrapped_t& get(cugraph::mtmg::handle_t const& handle) + { + std::lock_guard lock(lock_); + + auto pos = objects_.find(handle.get_local_rank()); + CUGRAPH_EXPECTS(pos != objects_.end(), "Uninitialized wrapped object"); + + return pos->second; + } + + /** + * @brief Get the pointer to an object for a particular thread from this wrapper + * + * @param handle Handle is used to identify the GPU we associated this object with + * @return Shared pointer the wrapped object + */ + wrapped_t const& get(cugraph::mtmg::handle_t const& handle) const + { + std::lock_guard lock(lock_); + + auto pos = objects_.find(handle.get_local_rank()); + + CUGRAPH_EXPECTS(pos != objects_.end(), "Uninitialized wrapped object"); + + return pos->second; + } + + protected: + mutable std::mutex lock_{}; + std::map objects_{}; +}; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp new file mode 100644 index 00000000000..8011146ee4f --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +// FIXME: Could use std::span once compiler supports C++20 +#include + +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief An edgelist for each GPU + * + * Manages an edge list for edges associated with a particular GPU. Multiple threads + * can call the append() method, possibly concurrently. To avoid constantly copying + * when the buffers fill up, the class will create a device buffer containing a + * number of elements specified in the constructor. When that device buffer is full + * we will create a new buffer. + * + * When we try and use the edgelist we will consolidate the buffers, since at that + * time we know the entire size required. + * + * Important note, the expectation is that this object will be used in two phases: + * 1) The append() method will be used to fill buffers with edges + * 2) The edges will be consumed to create a graph + * + * These two phases are expected to be disjoint. The calling process is expected to + * manage some barrier so that all threads are guaranteed to be completed before changing + * phases. If an append() call (part of the filling phase) overlaps with calls to + * finalize_buffer(), consolidate_and_shuffle(), get_src(), get_dst(), get_wgt(), + * get_edge_id() and get_edge_type() then the behavior is undefined (data might change + * in some non-deterministic way). + */ +template +class per_device_edgelist_t { + public: + per_device_edgelist_t() = delete; + per_device_edgelist_t(per_device_edgelist_t const&) = delete; + per_device_edgelist_t& operator=(per_device_edgelist_t const&) = delete; + per_device_edgelist_t& operator=(per_device_edgelist_t&&) = delete; + + per_device_edgelist_t(cugraph::mtmg::handle_t const& handle, + size_t device_buffer_size, + bool use_weight, + bool use_edge_id, + bool use_edge_type) + : device_buffer_size_{device_buffer_size}, + current_pos_{0}, + src_{}, + dst_{}, + wgt_{std::nullopt}, + edge_id_{std::nullopt}, + edge_type_{std::nullopt} + { + if (use_weight) { wgt_ = std::make_optional(std::vector>()); } + + if (use_edge_id) { edge_id_ = std::make_optional(std::vector>()); } + + if (use_edge_type) { + edge_type_ = std::make_optional(std::vector>()); + } + + create_new_buffers(handle); + } + + per_device_edgelist_t(per_device_edgelist_t&& other) + : device_buffer_size_{other.device_buffer_size_}, + current_pos_{other.current_pos_}, + src_{std::move(other.src_)}, + dst_{std::move(other.dst_)}, + wgt_{std::move(other.wgt_)}, + edge_id_{std::move(other.edge_id_)}, + edge_type_{std::move(other.edge_type_)} + { + } + + /** + * @brief Append a list of edges to the edge list + * + * @param handle The resource handle + * @param src Source vertex id + * @param dst Destination vertex id + * @param wgt Edge weight + * @param edge_id Edge id + * @param edge_type Edge type + */ + void append(handle_t const& handle, + raft::host_span src, + raft::host_span dst, + std::optional> wgt, + std::optional> edge_id, + std::optional> edge_type) + { + // FIXME: This lock guard could be on a smaller region, but it + // would require more careful coding. The raft::update_device + // calls could be done without the lock if we made a local + // of the values of *.back() and did an increment of current_pos_ + // while we hold the lock. 
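+    // Copy the incoming host spans into the current device buffer in chunks; whenever the
+    // current buffer fills up, allocate a fresh device buffer and continue with the remainder.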
+ std::lock_guard lock(lock_); + + size_t count = src.size(); + size_t pos = 0; + + while (count > 0) { + size_t copy_count = std::min(count, (src_.back().size() - current_pos_)); + + raft::update_device( + src_.back().begin() + current_pos_, src.begin() + pos, copy_count, handle.get_stream()); + raft::update_device( + dst_.back().begin() + current_pos_, dst.begin() + pos, copy_count, handle.get_stream()); + if (wgt) + raft::update_device( + wgt_->back().begin() + current_pos_, wgt->begin() + pos, copy_count, handle.get_stream()); + if (edge_id) + raft::update_device(edge_id_->back().begin() + current_pos_, + edge_id->begin() + pos, + copy_count, + handle.get_stream()); + if (edge_type) + raft::update_device(edge_type_->back().begin() + current_pos_, + edge_type->begin() + pos, + copy_count, + handle.get_stream()); + + count -= copy_count; + pos += copy_count; + current_pos_ += copy_count; + + if (current_pos_ == src_.back().size()) { create_new_buffers(handle); } + } + + handle.raft_handle().sync_stream(); + } + + /** + * @brief Mark the edgelist as ready for reading (all writes are complete) + * + * @param handle The resource handle + */ + void finalize_buffer(handle_t const& handle) + { + src_.back().resize(current_pos_, handle.get_stream()); + dst_.back().resize(current_pos_, handle.get_stream()); + if (wgt_) wgt_->back().resize(current_pos_, handle.get_stream()); + if (edge_id_) edge_id_->back().resize(current_pos_, handle.get_stream()); + if (edge_type_) edge_type_->back().resize(current_pos_, handle.get_stream()); + } + + bool use_weight() const { return wgt_.has_value(); } + + bool use_edge_id() const { return edge_id_.has_value(); } + + bool use_edge_type() const { return edge_type_.has_value(); } + + std::vector>& get_src() { return src_; } + std::vector>& get_dst() { return dst_; } + std::optional>>& get_wgt() { return wgt_; } + std::optional>>& get_edge_id() { return edge_id_; } + std::optional>>& get_edge_type() + { + return edge_type_; + } + + /** + * @brief Consolidate edgelists (if necessary) and shuffle to the proper GPU + * + * @param handle The resource handle + */ + void consolidate_and_shuffle(cugraph::mtmg::handle_t const& handle, bool store_transposed) + { + if (src_.size() > 1) { + size_t total_size = std::transform_reduce( + src_.begin(), src_.end(), size_t{0}, std::plus(), [](auto& d_vector) { + return d_vector.size(); + }); + + resize_and_copy_buffers(handle.get_stream(), src_, total_size); + resize_and_copy_buffers(handle.get_stream(), dst_, total_size); + if (wgt_) resize_and_copy_buffers(handle.get_stream(), *wgt_, total_size); + if (edge_id_) resize_and_copy_buffers(handle.get_stream(), *edge_id_, total_size); + if (edge_type_) resize_and_copy_buffers(handle.get_stream(), *edge_type_, total_size); + } + + auto tmp_wgt = wgt_ ? std::make_optional(std::move((*wgt_)[0])) : std::nullopt; + auto tmp_edge_id = edge_id_ ? std::make_optional(std::move((*edge_id_)[0])) : std::nullopt; + auto tmp_edge_type = + edge_type_ ? std::make_optional(std::move((*edge_type_)[0])) : std::nullopt; + + std::tie(store_transposed ? dst_[0] : src_[0], + store_transposed ? src_[0] : dst_[0], + tmp_wgt, + tmp_edge_id, + tmp_edge_type) = + cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( + handle.raft_handle(), + store_transposed ? std::move(dst_[0]) : std::move(src_[0]), + store_transposed ? 
std::move(src_[0]) : std::move(dst_[0]), + std::move(tmp_wgt), + std::move(tmp_edge_id), + std::move(tmp_edge_type)); + + if (tmp_wgt) ((*wgt_)[0]) = std::move(*tmp_wgt); + if (tmp_edge_id) ((*edge_id_)[0]) = std::move(*tmp_edge_id); + if (tmp_edge_type) ((*edge_type_)[0]) = std::move(*tmp_edge_type); + } + + private: + template + void resize_and_copy_buffers(rmm::cuda_stream_view stream, + std::vector>& buffer, + size_t total_size) + { + size_t pos = buffer[0].size(); + buffer[0].resize(total_size, stream); + + for (size_t i = 1; i < buffer.size(); ++i) { + raft::copy(buffer[0].data() + pos, buffer[i].data(), buffer[i].size(), stream); + pos += buffer[i].size(); + buffer[i].resize(0, stream); + buffer[i].shrink_to_fit(stream); + } + + std::vector> new_buffer; + new_buffer.push_back(std::move(buffer[0])); + buffer = std::move(new_buffer); + } + + void create_new_buffers(cugraph::mtmg::handle_t const& handle) + { + src_.emplace_back(device_buffer_size_, handle.get_stream()); + dst_.emplace_back(device_buffer_size_, handle.get_stream()); + + if (wgt_) { wgt_->emplace_back(device_buffer_size_, handle.get_stream()); } + + if (edge_id_) { edge_id_->emplace_back(device_buffer_size_, handle.get_stream()); } + + if (edge_type_) { edge_type_->emplace_back(device_buffer_size_, handle.get_stream()); } + + current_pos_ = 0; + } + + mutable std::mutex lock_{}; + + size_t current_pos_{0}; + size_t device_buffer_size_{0}; + + std::vector> src_{}; + std::vector> dst_{}; + std::optional>> wgt_{}; + std::optional>> edge_id_{}; + std::optional>> edge_type_{}; +}; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/edge_property.hpp b/cpp/include/cugraph/mtmg/edge_property.hpp new file mode 100644 index 00000000000..afa72492b9a --- /dev/null +++ b/cpp/include/cugraph/mtmg/edge_property.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Edge property object for each GPU + */ +template +class edge_property_t : public detail::device_shared_wrapper_t< + cugraph::edge_property_t> { + public: + using parent_t = detail::device_shared_wrapper_t< + cugraph::edge_property_t>; + + /** + * @brief Return a edge_property_view_t (read only) + */ + auto view() + { + std::lock_guard lock(parent_t::lock_); + + using edge_t = typename graph_view_t::wrapped_t::edge_type; + using buffer_t = + typename cugraph::edge_property_t::buffer_type; + std::vector buffers{}; + using const_value_iterator_t = decltype(get_dataframe_buffer_cbegin(buffers[0])); + + edge_property_view_t result; + + std::for_each(parent_t::objects_.begin(), parent_t::objects_.end(), [&result](auto& p) { + result.set(p.first, p.second.view()); + }); + + return result; + } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/edge_property_view.hpp b/cpp/include/cugraph/mtmg/edge_property_view.hpp new file mode 100644 index 00000000000..c84a6458e1d --- /dev/null +++ b/cpp/include/cugraph/mtmg/edge_property_view.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Edge property object for each GPU + */ +template +using edge_property_view_t = + detail::device_shared_wrapper_t>; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/edgelist.hpp b/cpp/include/cugraph/mtmg/edgelist.hpp new file mode 100644 index 00000000000..90c53dfbb64 --- /dev/null +++ b/cpp/include/cugraph/mtmg/edgelist.hpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Edgelist object for each GPU + */ +template +class edgelist_t : public detail::device_shared_wrapper_t< + detail::per_device_edgelist_t> { + public: + /** + * @brief Create a per_device_edgelist for this GPU + */ + void set(handle_t const& handle, + size_t device_buffer_size, + bool use_weight, + bool use_edge_id, + bool use_edge_type) + { + detail::per_device_edgelist_t tmp( + handle, device_buffer_size, use_weight, use_edge_id, use_edge_type); + + detail::device_shared_wrapper_t< + detail::per_device_edgelist_t>::set(handle, + std::move(tmp)); + } + + /** + * @brief Stop inserting edges into this edgelist so we can use the edges + */ + void finalize_buffer(handle_t const& handle) { this->get(handle).finalize_buffer(handle); } + + /** + * @brief Consolidate for the edgelist edges into a single edgelist and then + * shuffle across GPUs. + */ + void consolidate_and_shuffle(cugraph::mtmg::handle_t const& handle, bool store_transposed) + { + this->get(handle).consolidate_and_shuffle(handle, store_transposed); + } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/graph.hpp b/cpp/include/cugraph/mtmg/graph.hpp new file mode 100644 index 00000000000..76a2f401425 --- /dev/null +++ b/cpp/include/cugraph/mtmg/graph.hpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Graph object for each GPU + */ +template +class graph_t : public detail::device_shared_wrapper_t< + cugraph::graph_t> { + using parent_t = detail::device_shared_wrapper_t< + cugraph::graph_t>; + + public: + /** + * @brief Create an MTMG graph view (read only) + */ + auto view() + { + std::lock_guard lock(parent_t::lock_); + + cugraph::mtmg::graph_view_t result; + + std::for_each(parent_t::objects_.begin(), parent_t::objects_.end(), [&result](auto& p) { + result.set(p.first, std::move(p.second.view())); + }); + + return result; + } +}; + +/** + * @brief Create an MTMG graph from an edgelist + * + * @param[in] handle Resource handle + * @param[in] edgelist Edgelist + * @param[in] graph_properties Graph properties + * @param[in] renumber If true, renumber graph (must be true for MG) + * @param[out] graph MTMG graph is stored here + * @param[out] edge_weights MTMG edge weights is stored here + * @param[out] edge_ids MTMG edge ids is stored here + * @param[out] edge_types MTMG edge types is stored here + * @param[in] renumber_map MTMG renumber_map is stored here + * @param[in] do_expensive_check A flag to run expensive checks for input arguments (if set to + * `true`). 
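+ *
+ * A minimal calling sketch (illustrative only; assumes finalize_buffer() and
+ * consolidate_and_shuffle() have already been called on @p edgelist and that the
+ * template parameters shown here match how the edgelist was constructed):
+ * @code
+ * cugraph::mtmg::graph_t<vertex_t, edge_t, false, true> graph;
+ * std::optional<cugraph::mtmg::edge_property_t<
+ *   cugraph::mtmg::graph_view_t<vertex_t, edge_t, false, true>, weight_t>>
+ *   edge_weights{std::nullopt};
+ * std::optional<cugraph::mtmg::edge_property_t<
+ *   cugraph::mtmg::graph_view_t<vertex_t, edge_t, false, true>, edge_id_t>>
+ *   edge_ids{std::nullopt};
+ * std::optional<cugraph::mtmg::edge_property_t<
+ *   cugraph::mtmg::graph_view_t<vertex_t, edge_t, false, true>, edge_type_t>>
+ *   edge_types{std::nullopt};
+ * std::optional<cugraph::mtmg::renumber_map_t<vertex_t>> renumber_map{std::in_place};
+ *
+ * create_graph_from_edgelist(handle,
+ *                            edgelist,
+ *                            cugraph::graph_properties_t{false, false},
+ *                            true,  // renumber
+ *                            graph,
+ *                            edge_weights,
+ *                            edge_ids,
+ *                            edge_types,
+ *                            renumber_map);
+ * @endcode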
+ */ +template +void create_graph_from_edgelist( + handle_t const& handle, + cugraph::mtmg::edgelist_t& edgelist, + graph_properties_t graph_properties, + bool renumber, + cugraph::mtmg::graph_t& graph, + std::optional, + weight_t>>& edge_weights, + std::optional, + edge_id_t>>& edge_ids, + std::optional, + edge_type_t>>& edge_types, + std::optional>& renumber_map, + bool do_expensive_check = false) +{ + if (handle.get_thread_rank() > 0) return; + + CUGRAPH_EXPECTS(renumber_map.has_value() == renumber, + "Renumbering set to true, but no space for renumber map"); + + auto& my_edgelist = edgelist.get(handle); + + CUGRAPH_EXPECTS(my_edgelist.get_src().size() > 0, "Cannot create graph without an edge list"); + CUGRAPH_EXPECTS(my_edgelist.get_src().size() == 1, + "Must consolidate edges into a single list before creating graph"); + + auto [local_graph, local_edge_weights, local_edge_ids, local_edge_types, local_renumber_map] = + cugraph::create_graph_from_edgelist( + handle.raft_handle(), + std::nullopt, + std::move(my_edgelist.get_src()[0]), + std::move(my_edgelist.get_dst()[0]), + my_edgelist.get_wgt() ? std::make_optional(std::move((*my_edgelist.get_wgt())[0])) + : std::nullopt, + my_edgelist.get_edge_id() ? std::make_optional(std::move((*my_edgelist.get_edge_id())[0])) + : std::nullopt, + my_edgelist.get_edge_type() ? std::make_optional(std::move((*my_edgelist.get_edge_type())[0])) + : std::nullopt, + graph_properties, + renumber, + do_expensive_check); + + graph.set(handle, std::move(local_graph)); + if (edge_weights) edge_weights->set(handle, std::move(*local_edge_weights)); + if (edge_ids) edge_ids->set(handle, std::move(*local_edge_ids)); + if (edge_types) edge_types->set(handle, std::move(*local_edge_types)); + if (renumber) renumber_map->set(handle, std::move(*local_renumber_map)); +} + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/graph_view.hpp b/cpp/include/cugraph/mtmg/graph_view.hpp new file mode 100644 index 00000000000..94347e016ea --- /dev/null +++ b/cpp/include/cugraph/mtmg/graph_view.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Graph view for each GPU + */ +template +using graph_view_t = detail::device_shared_wrapper_t< + cugraph::graph_view_t>; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/handle.hpp b/cpp/include/cugraph/mtmg/handle.hpp new file mode 100644 index 00000000000..f23bce5aeac --- /dev/null +++ b/cpp/include/cugraph/mtmg/handle.hpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Resource handler + * + * Multi-threaded resource handler. Every GPU gets a raft::handle object that provides access to + * the GPU resources. In a multi-threaded environment multiple threads will share a particular GPU. + * Following the MPI model, each thread will be assigned to a thread rank. + * + */ +class handle_t { + public: + /** + * @brief Constructor + * + * @param raft_handle Raft handle for the resources + * @param thread_rank Rank for this thread + */ + handle_t(raft::handle_t const& raft_handle, int thread_rank, size_t device_id) + : raft_handle_(raft_handle), + thread_rank_(thread_rank), + local_rank_(raft_handle.get_comms().get_rank()), // FIXME: update for multi-node + device_id_(device_id) + { + } + + /** + * @brief Get the raft handle + * + * @return const reference to a raft handle + */ + raft::handle_t const& raft_handle() const { return raft_handle_; } + + /** + * @brief Get cuda stream + * + * @return cuda stream + */ + rmm::cuda_stream_view get_stream() const + { + return raft_handle_.is_stream_pool_initialized() + ? raft_handle_.get_stream_from_stream_pool(device_id_) + : raft_handle_.get_stream(); + } + + /** + * @brief Get thread rank + * + * @return thread rank + */ + int get_thread_rank() const { return thread_rank_; } + + /** + * @brief Get number of gpus + * + * @return number of gpus + */ + int get_size() const { return raft_handle_.get_comms().get_size(); } + + /** + * @brief Get number of local gpus + * + * @return number of local gpus + */ + // FIXME: wrong for multi-node + int get_local_size() const { return raft_handle_.get_comms().get_size(); } + + /** + * @brief Get gpu rank + * + * @return gpu rank + */ + int get_rank() const { return raft_handle_.get_comms().get_rank(); } + + /** + * @brief Get local gpu rank + * + * @return local gpu rank + */ + int get_local_rank() const { return local_rank_; } + + private: + raft::handle_t const& raft_handle_; + int thread_rank_; + int local_rank_; + size_t device_id_; +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/instance_manager.hpp b/cpp/include/cugraph/mtmg/instance_manager.hpp new file mode 100644 index 00000000000..8bf62b56f4b --- /dev/null +++ b/cpp/include/cugraph/mtmg/instance_manager.hpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Manages a subset of the cluster for a set of graph computations + */ +class instance_manager_t { + public: + /** + * @brief Constructor + * + * @param handles Vector of RAFT handles, one for each device on this node + */ + instance_manager_t(std::vector>&& handles, + std::vector>&& nccl_comms, + std::vector&& device_ids, + int local_gpu_count) + : thread_counter_{0}, + raft_handle_{std::move(handles)}, + nccl_comms_{std::move(nccl_comms)}, + device_ids_{std::move(device_ids)}, + local_gpu_count_{local_gpu_count} + { + } + + /** + * @brief Get handle + * + * The instance manager will construct a handle appropriate for the thread making + * the request. Threads will be assigned to GPUs in a round-robin fashion to + * spread requesting threads around the GPU resources. + * + * This function will be CPU thread-safe. + * + * @return a handle for this thread. + */ + handle_t get_handle() + { + int local_id = thread_counter_++; + + RAFT_CUDA_TRY(cudaSetDevice(device_ids_[local_id % raft_handle_.size()].value())); + return handle_t(*raft_handle_[local_id % raft_handle_.size()], + local_id / raft_handle_.size(), + static_cast(local_id % raft_handle_.size())); + } + + /** + * @brief Reset the thread counter + * + * After a parallel activity is completed, we need to reset the thread counter so that + * future threads will round robin around the GPUs properly. + */ + void reset_threads() { thread_counter_.store(0); } + + /** + * @brief Number of local GPUs in the instance + */ + int get_local_gpu_count() { return local_gpu_count_; } + + private: + // FIXME: Should this be an std::map<> where the key is the rank? + // On a multi-node system we might have nodes with fewer + // (or no) GPUs, so mapping rank to a handle might be a challenge + // + std::vector> raft_handle_{}; + std::vector> nccl_comms_{}; + std::vector device_ids_{}; + int local_gpu_count_{}; + + std::atomic thread_counter_{0}; +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp b/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp new file mode 100644 index 00000000000..b672db48719 --- /dev/null +++ b/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Supports creating an edgelist from individual host threads + * + * A cugraph edgelist needs to contain all of the edges necessary to create the graph + * stored in GPU memory (distributed across multiple GPUs in a multi-GPU configuration). + * + * This class provides a mechanism for populating the edgelist object from independent CPU threads. + * + * Calls to the append() method will take edges (in CPU host memory) and append them to a local + * buffer. 
As the local buffer fills, the buffer will be sent to GPU memory using the flush() + * method. This allows the CPU to GPU transfers to be larger (and consequently more efficient). + */ +template +class per_thread_edgelist_t { + public: + per_thread_edgelist_t() = delete; + per_thread_edgelist_t(per_thread_edgelist_t const&) = delete; + + /** + * @brief Only constructor + * + * @param edgelist The edge list this thread_edgelist_t should be associated with + * @param thread_buffer_size Size of the local buffer for accumulating edges on the CPU + */ + per_thread_edgelist_t( + detail::per_device_edgelist_t& edgelist, + size_t thread_buffer_size) + : edgelist_{edgelist}, + current_pos_{0}, + src_(thread_buffer_size), + dst_(thread_buffer_size), + wgt_{std::nullopt}, + edge_id_{std::nullopt}, + edge_type_{std::nullopt} + { + if (edgelist.use_weight()) wgt_ = std::make_optional(std::vector(thread_buffer_size)); + + if (edgelist.use_edge_id()) + edge_id_ = std::make_optional(std::vector(thread_buffer_size)); + + if (edgelist.use_edge_type()) + edge_type_ = std::make_optional(std::vector(thread_buffer_size)); + } + + /** + * @brief Append an edge to the edge list + * + * @param handle The resource handle + * @param src Source vertex id + * @param dst Destination vertex id + * @param wgt Edge weight + * @param edge_id Edge id + * @param edge_type Edge type + */ + void append(handle_t const& handle, + vertex_t src, + vertex_t dst, + std::optional wgt, + std::optional edge_id, + std::optional edge_type) + { + if (current_pos_ == src_.size()) { flush(handle); } + + src_[current_pos_] = src; + dst_[current_pos_] = dst; + if (wgt) (*wgt_)[current_pos_] = *wgt; + if (edge_id) (*edge_id_)[current_pos_] = *edge_id; + if (edge_type) (*edge_type_)[current_pos_] = *edge_type; + + ++current_pos_; + } + + /** + * @brief Append a list of edges to the edge list + * + * @param handle The resource handle + * @param src Source vertex id + * @param dst Destination vertex id + * @param wgt Edge weight + * @param edge_id Edge id + * @param edge_type Edge type + */ + void append(handle_t const& handle, + raft::host_span src, + raft::host_span dst, + std::optional> wgt, + std::optional> edge_id, + std::optional> edge_type) + { + size_t count = src.size(); + size_t pos = 0; + + while (count > 0) { + size_t copy_count = std::min(count, (src_.size() - current_pos_)); + + std::copy(src.begin() + pos, src.begin() + pos + copy_count, src_.begin() + current_pos_); + std::copy(dst.begin() + pos, dst.begin() + pos + copy_count, dst_.begin() + current_pos_); + if (wgt) + std::copy(wgt.begin() + pos, wgt.begin() + pos + copy_count, wgt_->begin() + current_pos_); + if (edge_id) + std::copy(edge_id.begin() + pos, + edge_id.begin() + pos + copy_count, + edge_id_->begin() + current_pos_); + if (edge_type) + std::copy(edge_type.begin() + pos, + edge_type.begin() + pos + copy_count, + edge_type_->begin() + current_pos_); + + if (current_pos_ == src_.size()) { flush(handle); } + + count -= copy_count; + pos += copy_count; + } + } + + /** + * @brief Flush thread data from host to GPU memory + * + * @param handle The resource handle + */ + void flush(handle_t const& handle) + { + edgelist_.append( + handle, + raft::host_span{src_.data(), current_pos_}, + raft::host_span{dst_.data(), current_pos_}, + wgt_ ? std::make_optional(raft::host_span{wgt_->data(), current_pos_}) + : std::nullopt, + edge_id_ ? std::make_optional(raft::host_span{edge_id_->data(), current_pos_}) + : std::nullopt, + edge_type_ + ? 
std::make_optional(raft::host_span{edge_type_->data(), current_pos_}) + : std::nullopt); + + current_pos_ = 0; + } + + private: + detail::per_device_edgelist_t& edgelist_; + size_t current_pos_{0}; + std::vector src_{}; + std::vector dst_{}; + std::optional> wgt_{}; + std::optional> edge_id_{}; + std::optional> edge_type_{}; +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/renumber_map.hpp b/cpp/include/cugraph/mtmg/renumber_map.hpp new file mode 100644 index 00000000000..da07d61bd96 --- /dev/null +++ b/cpp/include/cugraph/mtmg/renumber_map.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device vector for storing a renumber map + */ +template +class renumber_map_t : public detail::device_shared_device_vector_t { + using parent_t = detail::device_shared_device_vector_t; + + public: + /** + * @brief Return a view (read only) of the renumber map + */ + auto view() { return static_cast>(this->parent_t::view()); } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/renumber_map_view.hpp b/cpp/include/cugraph/mtmg/renumber_map_view.hpp new file mode 100644 index 00000000000..5ff7ff5e100 --- /dev/null +++ b/cpp/include/cugraph/mtmg/renumber_map_view.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device span for storing a renumber map + */ +template +using renumber_map_view_t = detail::device_shared_device_span_t; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/resource_manager.hpp b/cpp/include/cugraph/mtmg/resource_manager.hpp new file mode 100644 index 00000000000..b4633626e7c --- /dev/null +++ b/cpp/include/cugraph/mtmg/resource_manager.hpp @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Class for managing local and remote GPU resources for use in + * multi-threaded multi-GPU interface. + * + * Each process in a multi-GPU configuration should have an instance of this + * class. The resource manager object should be configured by calling + * register_local_gpu (or register_remote_gpu once we support a multi-node + * configuration) to allocate resources that can be used in the mtmg space. + * + * When we want to execute some graph computations, we need to create an instance for execution. + * Based on how big a subset of the desired compute resources is desired, we can allocate some + * number of GPUs to the problem (up to the total set of managed resources). + * + * The returned instance can be used to create a graph, execute one or more algorithms, etc. Once + * we are done the caller can delete the instance. + * + * At the moment, the caller is assumed to be responsible for scheduling use of the resources. + * + * For our first release, we will only consider a single node multi-GPU configuration, so the remote + * GPU methods are currently disabled via ifdef. + */ +class resource_manager_t { + public: + /** + * @brief Default constructor + */ + resource_manager_t() {} + + /** + * @brief add a local GPU to the resource manager. + * + * @param rank The rank to assign to the local GPU + * @param device_id The device_id corresponding to this rank + */ + void register_local_gpu(int rank, rmm::cuda_device_id device_id) + { + std::lock_guard lock(lock_); + + CUGRAPH_EXPECTS(local_rank_map_.find(rank) == local_rank_map_.end(), + "cannot register same rank multiple times"); + + int num_gpus_this_node; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_this_node)); + + CUGRAPH_EXPECTS((device_id.value() >= 0) && (device_id.value() < num_gpus_this_node), + "device id out of range"); + + local_rank_map_.insert(std::pair(rank, device_id)); + + RAFT_CUDA_TRY(cudaSetDevice(device_id.value())); + + // FIXME: There is a bug in the cuda_memory_resource that results in a Hang. + // using the pool resource as a work-around. + // + // There is a deprecated environment variable: NCCL_LAUNCH_MODE=GROUP + // which should temporarily work around this problem. 
+ // + // Ultimately there should be some RMM parameters passed into this function + // (or the constructor of the object) to configure this behavior +#if 0 + auto per_device_it = per_device_rmm_resources_.insert( + std::pair{rank, std::make_shared()}); +#else + auto const [free, total] = rmm::detail::available_device_memory(); + auto const min_alloc = + rmm::detail::align_down(std::min(free, total / 6), rmm::detail::CUDA_ALLOCATION_ALIGNMENT); + + auto per_device_it = per_device_rmm_resources_.insert( + std::pair{rank, + rmm::mr::make_owning_wrapper( + std::make_shared(), min_alloc)}); +#endif + + rmm::mr::set_per_device_resource(device_id, per_device_it.first->second.get()); + } + + /** + * @brief Create an instance using a subset of the registered resources + * + * The selected set of resources will be configured as an instance manager. + * If @ranks_to_include is a proper subset of the registered resources, + * ranks will be renumbered into the range [0, @p ranks_to_use.size()), making + * it a proper configuration. + * + * @param ranks_to_use a vector containing the ranks to include in the instance. + * Must be a subset of the entire set of available ranks. + * @param instance_manager_id a ncclUniqueId that is shared by all processes participating + * in this instance. All processes must use the same ID in this call, it is up + * to the calling code to share this ID properly before the call. + * + * @return unique pointer to instance manager + */ + std::unique_ptr create_instance_manager( + std::vector ranks_to_include, ncclUniqueId instance_manager_id) const + { + std::for_each( + ranks_to_include.begin(), ranks_to_include.end(), [local_ranks = local_rank_map_](int rank) { + CUGRAPH_EXPECTS(local_ranks.find(rank) != local_ranks.end(), + "requesting inclusion of an invalid rank"); + }); + + std::vector> nccl_comms{}; + std::vector> handles{}; + std::vector device_ids{}; + + nccl_comms.reserve(ranks_to_include.size()); + handles.reserve(ranks_to_include.size()); + device_ids.reserve(ranks_to_include.size()); + + // FIXME: not quite right for multi-node + auto gpu_row_comm_size = static_cast(sqrt(static_cast(ranks_to_include.size()))); + while (ranks_to_include.size() % gpu_row_comm_size != 0) { + --gpu_row_comm_size; + } + + // FIXME: not quite right for multi-node + for (size_t i = 0; i < ranks_to_include.size(); ++i) { + int rank = ranks_to_include[i]; + auto pos = local_rank_map_.find(rank); + RAFT_CUDA_TRY(cudaSetDevice(pos->second.value())); + + raft::handle_t tmp_handle; + + nccl_comms.push_back(std::make_unique()); + handles.push_back( + std::make_unique(tmp_handle, per_device_rmm_resources_.find(rank)->second)); + device_ids.push_back(pos->second); + } + + std::vector running_threads; + + for (size_t i = 0; i < ranks_to_include.size(); ++i) { + running_threads.emplace_back([instance_manager_id, + idx = i, + gpu_row_comm_size, + comm_size = ranks_to_include.size(), + &ranks_to_include, + &local_rank_map = local_rank_map_, + &nccl_comms, + &handles]() { + int rank = ranks_to_include[idx]; + auto pos = local_rank_map.find(rank); + RAFT_CUDA_TRY(cudaSetDevice(pos->second.value())); + + NCCL_TRY(ncclCommInitRank(nccl_comms[idx].get(), comm_size, instance_manager_id, rank)); + + raft::comms::build_comms_nccl_only(handles[idx].get(), *nccl_comms[idx], comm_size, rank); + + cugraph::partition_manager::init_subcomm(*handles[idx], gpu_row_comm_size); + }); + } + + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + + // FIXME: Update for multi-node 
+ return std::make_unique( + std::move(handles), std::move(nccl_comms), std::move(device_ids), ranks_to_include.size()); + } + + /** + * @brief Get a list of all of the currently registered ranks + * + * @return A copy of the list of ranks. + */ + std::vector registered_ranks() const + { + std::lock_guard lock(lock_); + + // + // C++20 mechanism: + // return std::vector{ std::views::keys(local_rank_map_).begin(), + // std::views::keys(local_rank_map_).end() }; + // Would need a bit more complicated to handle remote_rank_map_ also + // + std::vector registered_ranks(local_rank_map_.size()); + std::transform( + local_rank_map_.begin(), local_rank_map_.end(), registered_ranks.begin(), [](auto pair) { + return pair.first; + }); + + return registered_ranks; + } + + private: + mutable std::mutex lock_{}; + std::map local_rank_map_{}; + std::map> per_device_rmm_resources_{}; +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/vertex_result.hpp b/cpp/include/cugraph/mtmg/vertex_result.hpp new file mode 100644 index 00000000000..e8999b35aa9 --- /dev/null +++ b/cpp/include/cugraph/mtmg/vertex_result.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device vector for storing vertex results + */ +template +class vertex_result_t : public detail::device_shared_device_vector_t { + using parent_t = detail::device_shared_device_vector_t; + + public: + /** + * @brief Create a vertex result view (read only) + */ + auto view() { return static_cast>(this->parent_t::view()); } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/vertex_result_view.hpp b/cpp/include/cugraph/mtmg/vertex_result_view.hpp new file mode 100644 index 00000000000..7a7070d6f2a --- /dev/null +++ b/cpp/include/cugraph/mtmg/vertex_result_view.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device span for referencing a vertex result + */ +template +class vertex_result_view_t : public detail::device_shared_device_span_t { + using parent_t = detail::device_shared_device_span_t; + + public: + vertex_result_view_t(parent_t&& other) : parent_t{std::move(other)} {} + + /** + * @brief Gather results from specified vertices into a device vector + */ + template + rmm::device_uvector gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp new file mode 100644 index 00000000000..e42ef9bfcf3 --- /dev/null +++ b/cpp/include/cugraph/sampling_functions.hpp @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include + +namespace cugraph { + +/* + * @brief renumber sampled edge list and compress to the (D)CSR|(D)CSC format. + * + * This function renumbers sampling function (e.g. uniform_neighbor_sample) output edges fulfilling + * the following requirements. Assume major = source if @p src_is_major is true, major = destination + * if @p src_is_major is false. + * + * 1. If @p edgelist_hops is valid, we can consider (vertex ID, hop, flag=major) triplets for each + * vertex ID in edge majors (@p edgelist_srcs if @p src_is_major is true, @p edgelist_dsts if false) + * and (vertex ID, hop, flag=minor) triplets for each vertex ID in edge minors. From these triplets, + * we can find the minimum (hop, flag) pairs for every unique vertex ID (hop is the primary key and + * flag is the secondary key, flag=major is considered smaller than flag=minor if hop numbers are + * same). Vertex IDs with smaller (hop, flag) pairs precede vertex IDs with larger (hop, flag) pairs + * in renumbering. Ordering can be arbitrary among the vertices with the same (hop, flag) pairs. + * 2. If @p edgelist_hops is invalid, unique vertex IDs in edge majors precede vertex IDs that + * appear only in edge minors. + * 3. If edgelist_label_offsets.has_value() is true, edge lists for different labels will be + * renumbered separately. + * + * The renumbered edges are compressed based on the following requirements. + * + * 1. If @p compress_per_hop is true, edges are compressed separately for each hop. If @p + * compress_per_hop is false, edges with different hop numbers are compressed altogether. + * 2. Edges are compressed independently for different labels. + * 3. If @p doubly_compress is false, edges are compressed to CSR (if @p src_is_major is true) or + * CSC (if @p src_is_major is false). 
If @p doubly_compress is true, edges are compressed to DCSR + * (if @p src_is_major is true) or DCSC (if @p src_is_major is false). If @p doubly_compress is + * false, the CSR/CSC offset array size is the number of vertices (which is the maximum vertex ID + + * 1) + 1. Here, the maximum vertex ID is the maximum major vertex ID in the edges to compress if @p + * compress_per_hop is false or for hop 0. If @p compress_per_hop is true and hop number is 1 or + * larger, the maximum vertex ID is the larger of the maximum major vertex ID for this hop and the + * maximum vertex ID for the edges in the previous hops. + * + * If both @p compress_per_hop is false and @p edgelist_hops.has_value() is true, majors should be + * non-decreasing within each label after renumbering and sorting by (hop, major, minor). Also, + * majors in hop N should not appear in any of the previous hops. This condition is satisfied if + * majors in hop N + 1 does not have any vertices from the previous hops excluding the minors from + * hop N. + * + * This function is single-GPU only (we are not aware of any practical multi-GPU use cases). + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weight. Needs to be floating point type + * @tparam edge_id_t Type of edge id. Needs to be an integral type + * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is + * supported + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param edgelist_srcs A vector storing edgelist source vertices. + * @param edgelist_dsts A vector storing edgelist destination vertices (size = @p + * edgelist_srcs.size()). + * @param edgelist_weights An optional vector storing edgelist weights (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_ids An optional vector storing edgelist edge IDs (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_hops An optional tuple having a vector storing edge list hop numbers (size = @p + * edgelist_srcs.size() if valid) and the number of hops. + * @param edgelist_label_offsets An optional tuple storing a pointer to the array storing label + * offsets to the input edges (size = std::get<1>(*edgelist_label_offsets) + 1) and the number of + * labels. + * @param src_is_major A flag to determine whether to use the source or destination as the + * major key in renumbering and compression. + * @param compress_per_hop A flag to determine whether to compress edges with different hop numbers + * separately (if true) or altogether (if false). If @p compress_per_hop is true, @p + * edgelist_hops.has_value() should be true and @p doubly_compress should be false. + * @param doubly_compress A flag to determine whether to compress to the CSR/CSC format (if false) + * or the DCSR/DCSC format (if true). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
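 *
 * Illustrative example (not an additional requirement; single label, 2 hops, @p src_is_major =
 * true, @p compress_per_hop = false, @p doubly_compress = false):
 *   input edges: hop 0: 0->2, 0->5; hop 1: 2->7, 5->0.
 *   Vertex 0 first appears as a hop-0 major, vertices 2 and 5 as hop-0 minors, and vertex 7 as a
 *   hop-1 minor, so one valid renumbering is 0->0, 2->1, 5->2, 7->3 (2 and 5 may be swapped) and
 *   the renumber map is [0, 2, 5, 7]. The renumbered edges sorted by (hop, major, minor) are
 *   (0,1), (0,2), (1,3), (2,0), so the returned CSR offset array is [0, 2, 3, 4] and the minor
 *   (column) index array is [1, 2, 3, 0].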
+ * @return Tuple of vectors storing optional DCSR/DCSC major vertex IDs with one or more neighbors, + * (D)CSR|(D)CSC offset values, edge minor vertex IDs, optional edge weights (valid only if @p + * edgelist_weights.has_value() is true), optional edge IDs (valid only if @p + * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p + * edgelist_edge_types.has_value() is true), optional (label, hop) offset values to the + * (D)CSR|(D)CSC offset array (size = # labels * # hops + 1, where # labels = + * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1 + * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1 + * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p + * edgelist_hops.has_value() is rue), renumber_map to query original vertices (size = # unique + * vertices or aggregate # unique vertices for every label), and label offsets to the renumber_map + * (size = std::get<1>(*edgelist_label_offsets) + 1, valid only if @p + * edgelist_label_offsets.has_value() is true). + */ +template +std::tuple>, // dcsr/dcsc major vertices + rmm::device_uvector, // (d)csr/(d)csc offset values + rmm::device_uvector, // minor vertices + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>, // (label, hop) offsets to the (d)csr/(d)csc + // offset array + rmm::device_uvector, // renumber map + std::optional>> // label offsets to the renumber map +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major = true, + bool compress_per_hop = false, + bool doubly_compress = false, + bool do_expensive_check = false); + +/* + * @brief renumber sampled edge list and sort the renumbered edges. + * + * This function renumbers sampling function (e.g. uniform_neighbor_sample) output edges fulfilling + * the following requirements. Assume major = source if @p src_is_major is true, major = destination + * if @p src_is_major is false. + * + * 1. If @p edgelist_hops is valid, we can consider (vertex ID, hop, flag=major) triplets for each + * vertex ID in edge majors (@p edgelist_srcs if @p src_is_major is true, @p edgelist_dsts if false) + * and (vertex ID, hop, flag=minor) triplets for each vertex ID in edge minors. From these triplets, + * we can find the minimum (hop, flag) pairs for every unique vertex ID (hop is the primary key and + * flag is the secondary key, flag=major is considered smaller than flag=minor if hop numbers are + * same). Vertex IDs with smaller (hop, flag) pairs precede vertex IDs with larger (hop, flag) pairs + * in renumbering. Ordering can be arbitrary among the vertices with the same (hop, flag) pairs. + * 2. If @p edgelist_hops is invalid, unique vertex IDs in edge majors precede vertex IDs that + * appear only in edge minors. + * 3. If edgelist_label_offsets.has_value() is true, edge lists for different labels will be + * renumbered separately. + * + * The renumbered edges are sorted based on the following rules. + * + * 1. If @p src_is_major is true, use ((hop), src, dst) as the key in sorting. If @p src_is_major is + * false, use ((hop), dst, src) instead. 
hop is used only if @p edgelist_hops.has_value() is true. + * 2. Edges in each label are sorted independently if @p edgelist_label_offsets.has_value() is true. + * + * This function is single-GPU only (we are not aware of any practical multi-GPU use cases). + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weight. Needs to be floating point type + * @tparam edge_id_t Type of edge id. Needs to be an integral type + * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is + * supported + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param edgelist_srcs A vector storing edgelist source vertices. + * @param edgelist_dsts A vector storing edgelist destination vertices (size = @p + * edgelist_srcs.size()). + * @param edgelist_weights An optional vector storing edgelist weights (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_ids An optional vector storing edgelist edge IDs (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_hops An optional tuple having a vector storing edge list hop numbers (size = @p + * edgelist_srcs.size() if valid) and the number of hops. The hop vector values should be + * non-decreasing within each label. + * @param edgelist_label_offsets An optional tuple storing a pointer to the array storing label + * offsets to the input edges (size = std::get<1>(*edgelist_label_offsets) + 1) and the number of + * labels. + * @param src_is_major A flag to determine whether to use the source or destination as the + * major key in renumbering and sorting. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + * @return Tuple of vectors storing edge sources, edge destinations, optional edge weights (valid + * only if @p edgelist_weights.has_value() is true), optional edge IDs (valid only if @p + * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p + * edgelist_edge_types.has_value() is true), optional (label, hop) offset values to the renumbered + * and sorted edges (size = # labels * # hops + 1, where # labels = + * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1 + * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1 + * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p + * edgelist_hops.has_value() is true), renumber_map to query original vertices (size = # unique + * vertices or aggregate # unique vertices for every label), and label offsets to the renumber_map + * (size = std::get<1>(*edgelist_label_offsets) + 1, valid only if @p + * edgelist_label_offsets.has_value() is true). 
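 *
 * Minimal usage sketch (illustrative only; the variable names, the explicit vertex/weight/edge
 * ID/edge type instantiations, and the single-label, multi-hop setup with num_hops a size_t are
 * assumptions, not part of this API):
 *
 *   auto [srcs, dsts, wgts, ids, types, label_hop_offsets, renumber_map, renumber_map_offsets] =
 *     cugraph::renumber_and_sort_sampled_edgelist<int32_t, float, int64_t, int32_t>(
 *       handle,
 *       std::move(sampled_srcs),
 *       std::move(sampled_dsts),
 *       std::move(sampled_weights),
 *       std::nullopt,  // no edge IDs
 *       std::nullopt,  // no edge types
 *       std::make_optional(std::make_tuple(std::move(sampled_hops), num_hops)),
 *       std::nullopt,  // no labels (single implicit label)
 *       true,          // src_is_major
 *       false);        // do_expensive_check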
+ */ +template +std::tuple, // srcs + rmm::device_uvector, // dsts + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>, // (label, hop) offsets to the edges + rmm::device_uvector, // renumber map + std::optional>> // label offsets to the renumber map +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major = true, + bool do_expensive_check = false); + +/* + * @brief sort sampled edge list. + * + * Sampled edges are sorted based on the following rules. + * + * 1. If @p src_is_major is true, use ((hop), src, dst) as the key in sorting. If @p src_is_major is + * false, use ((hop), dst, src) instead. hop is used only if @p edgelist_hops.has_value() is true. + * 2. Edges in each label are sorted independently if @p edgelist_label_offsets.has_value() is true. + * + * This function is single-GPU only (we are not aware of any practical multi-GPU use cases). + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weight. Needs to be floating point type + * @tparam edge_id_t Type of edge id. Needs to be an integral type + * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is + * supported + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param edgelist_srcs A vector storing edgelist source vertices. + * @param edgelist_dsts A vector storing edgelist destination vertices (size = @p + * edgelist_srcs.size()). + * @param edgelist_weights An optional vector storing edgelist weights (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_ids An optional vector storing edgelist edge IDs (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_hops An optional tuple having a vector storing edge list hop numbers (size = @p + * edgelist_srcs.size() if valid) and the number of hops. The hop vector values should be + * non-decreasing within each label. + * @param edgelist_label_offsets An optional tuple storing a pointer to the array storing label + * offsets to the input edges (size = std::get<1>(*edgelist_label_offsets) + 1) and the number of + * labels. + * @param src_is_major A flag to determine whether to use the source or destination as the + * major key in renumbering and sorting. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
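 *
 * Illustrative example of the resulting order (not an additional requirement; single label, 2
 * hops, @p src_is_major = true):
 *   input (hop, src, dst):  (0, 0, 5), (0, 0, 2), (1, 2, 7), (1, 2, 3)
 *   sorted (hop, src, dst): (0, 0, 2), (0, 0, 5), (1, 2, 3), (1, 2, 7)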
+ * @return Tuple of vectors storing edge sources, edge destinations, optional edge weights (valid + * only if @p edgelist_weights.has_value() is true), optional edge IDs (valid only if @p + * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p + * edgelist_edge_types.has_value() is true), and optional (label, hop) offset values to the + * renumbered and sorted edges (size = # labels * # hops + 1, where # labels = + * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1 + * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1 + * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p + * edgelist_hops.has_value() is true) + */ +template +std::tuple, // srcs + rmm::device_uvector, // dsts + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>> // (label, hop) offsets to the edges +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major = true, + bool do_expensive_check = false); + +} // namespace cugraph diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index caaba8e9c8d..f146c331d8c 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include diff --git a/cpp/src/components/legacy/scc_matrix.cuh b/cpp/src/components/legacy/scc_matrix.cuh index 3d56bdc5bf4..d044123bed0 100644 --- a/cpp/src/components/legacy/scc_matrix.cuh +++ b/cpp/src/components/legacy/scc_matrix.cuh @@ -68,7 +68,7 @@ struct SCC_Data { SCC_Data(size_t nrows, const IndexT* p_d_r_o, // row_offsets const IndexT* p_d_c_i) - : // column indices + : // column indices nrows_(nrows), p_d_r_o_(p_d_r_o), p_d_c_i_(p_d_c_i), diff --git a/cpp/src/cores/core_number_impl.cuh b/cpp/src/cores/core_number_impl.cuh index b63ae60f052..ea8e2a9c4ee 100644 --- a/cpp/src/cores/core_number_impl.cuh +++ b/cpp/src/cores/core_number_impl.cuh @@ -72,7 +72,7 @@ struct v_to_core_number_t { // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used template struct mult_degree_by_two_t { - __device__ edge_t operator()(edge_t d) const { return d* edge_t{2}; } + __device__ edge_t operator()(edge_t d) const { return d * edge_t{2}; } }; } // namespace diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index 3a84cdedfda..92c70fcff20 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -388,9 +388,11 @@ void pagerank(raft::handle_t const& handle, handle, graph_view, edge_weight_view, - std::make_optional(raft::device_span{ - *precomputed_vertex_out_weight_sums, - static_cast(graph_view.local_vertex_partition_range_size())}), + precomputed_vertex_out_weight_sums + ? std::make_optional(raft::device_span{ + *precomputed_vertex_out_weight_sums, + static_cast(graph_view.local_vertex_partition_range_size())}) + : std::nullopt, personalization_vertices ? 
std::make_optional(std::make_tuple( raft::device_span{*personalization_vertices, diff --git a/cpp/src/mtmg/vertex_result.cu b/cpp/src/mtmg/vertex_result.cu new file mode 100644 index 00000000000..a669a127f41 --- /dev/null +++ b/cpp/src/mtmg/vertex_result.cu @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include + +namespace cugraph { +namespace mtmg { + +template +template +rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view) +{ + auto this_gpu_graph_view = graph_view.get(handle); + + rmm::device_uvector local_vertices(vertices.size(), handle.get_stream()); + rmm::device_uvector vertex_gpu_ids(vertices.size(), handle.get_stream()); + rmm::device_uvector vertex_pos(vertices.size(), handle.get_stream()); + rmm::device_uvector result(vertices.size(), handle.get_stream()); + + raft::copy(local_vertices.data(), vertices.data(), vertices.size(), handle.get_stream()); + cugraph::detail::scalar_fill( + handle.get_stream(), vertex_gpu_ids.data(), vertex_gpu_ids.size(), handle.get_rank()); + cugraph::detail::sequence_fill( + handle.get_stream(), vertex_pos.data(), vertex_pos.size(), size_t{0}); + + rmm::device_uvector d_vertex_partition_range_lasts( + this_gpu_graph_view.vertex_partition_range_lasts().size(), handle.get_stream()); + raft::update_device(d_vertex_partition_range_lasts.data(), + this_gpu_graph_view.vertex_partition_range_lasts().data(), + this_gpu_graph_view.vertex_partition_range_lasts().size(), + handle.get_stream()); + + if (renumber_map_view) { + cugraph::renumber_ext_vertices( + handle.raft_handle(), + local_vertices.data(), + local_vertices.size(), + renumber_map_view->get(handle).data(), + this_gpu_graph_view.local_vertex_partition_range_first(), + this_gpu_graph_view.local_vertex_partition_range_last()); + } + + auto const major_comm_size = + handle.raft_handle().get_subcomm(cugraph::partition_manager::major_comm_name()).get_size(); + auto const minor_comm_size = + handle.raft_handle().get_subcomm(cugraph::partition_manager::minor_comm_name()).get_size(); + + std::forward_as_tuple(local_vertices, std::tie(vertex_gpu_ids, vertex_pos), std::ignore) = + groupby_gpu_id_and_shuffle_kv_pairs( + handle.raft_handle().get_comms(), + local_vertices.begin(), + local_vertices.end(), + thrust::make_zip_iterator(vertex_gpu_ids.begin(), vertex_pos.begin()), + cugraph::detail::compute_gpu_id_from_int_vertex_t{ + raft::device_span(d_vertex_partition_range_lasts.data(), + d_vertex_partition_range_lasts.size()), + major_comm_size, + minor_comm_size}, + handle.get_stream()); + + // + // Now gather + // + rmm::device_uvector tmp_result(local_vertices.size(), handle.get_stream()); + + auto& wrapped = this->get(handle); + + auto vertex_partition = vertex_partition_device_view_t( + 
this_gpu_graph_view.local_vertex_partition_view()); + + auto iter = + thrust::make_transform_iterator(local_vertices.begin(), [vertex_partition] __device__(auto v) { + return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); + }); + + thrust::gather(handle.raft_handle().get_thrust_policy(), + iter, + iter + local_vertices.size(), + wrapped.begin(), + tmp_result.begin()); + + // + // Shuffle back + // + std::forward_as_tuple(std::ignore, std::tie(std::ignore, vertex_pos, tmp_result), std::ignore) = + groupby_gpu_id_and_shuffle_kv_pairs( + handle.raft_handle().get_comms(), + vertex_gpu_ids.begin(), + vertex_gpu_ids.end(), + thrust::make_zip_iterator(local_vertices.begin(), vertex_pos.begin(), tmp_result.begin()), + [] __device__(int gpu) { return gpu; }, + handle.get_stream()); + + // + // Finally, reorder result + // + thrust::scatter(handle.raft_handle().get_thrust_policy(), + tmp_result.begin(), + tmp_result.end(), + vertex_pos.begin(), + result.begin()); + + return result; +} + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/src/prims/kv_store.cuh b/cpp/src/prims/kv_store.cuh index 8490bacfd9c..c46e83aa5da 100644 --- a/cpp/src/prims/kv_store.cuh +++ b/cpp/src/prims/kv_store.cuh @@ -31,6 +31,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index b238b964ede..e6db21f1c7c 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -108,7 +108,7 @@ struct convert_pair_to_quadruplet_t { thrust::seq, displacement_first, displacement_first + minor_comm_size, nbr_idx))) - 1; local_nbr_idx -= *(displacement_first + minor_comm_rank); - cuda::std::atomic_ref counter(tx_counts[minor_comm_rank]); + cuda::atomic_ref counter(tx_counts[minor_comm_rank]); intra_partition_offset = counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); } return thrust::make_tuple(minor_comm_rank, intra_partition_offset, local_nbr_idx, key_idx); @@ -252,7 +252,7 @@ struct count_t { __device__ size_t operator()(size_t key_idx) const { - cuda::std::atomic_ref counter(sample_counts[key_idx]); + cuda::atomic_ref counter(sample_counts[key_idx]); return 
counter.fetch_add(int32_t{1}, cuda::std::memory_order_relaxed); } }; @@ -287,7 +287,7 @@ rmm::device_uvector get_sampling_index_without_replacement( #ifndef NO_CUGRAPH_OPS edge_t mid_partition_degree_range_last = static_cast(K * 10); // tuning parameter assert(mid_partition_degree_range_last > K); - size_t high_partition_over_sampling_K = K * 2; // tuning parameter + size_t high_partition_over_sampling_K = K * 2; // tuning parameter assert(high_partition_over_sampling_K > K); rmm::device_uvector sample_nbr_indices(frontier_degrees.size() * K, handle.get_stream()); @@ -883,7 +883,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_nbr_indices); // neighbor index within an edge partition (note that each vertex's // neighbors are distributed in minor_comm_size partitions) std::optional> sample_key_indices{ - std::nullopt}; // relevant only when (minor_comm_size > 1) + std::nullopt}; // relevant only when (minor_comm_size > 1) auto local_frontier_sample_counts = std::vector{}; auto local_frontier_sample_displacements = std::vector{}; if (minor_comm_size > 1) { diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh index 46789c6b8bd..6a7334e9f1a 100644 --- a/cpp/src/sampling/random_walks.cuh +++ b/cpp/src/sampling/random_walks.cuh @@ -197,19 +197,19 @@ struct col_indx_extract_t { void operator()( original::device_vec_t const& d_coalesced_src_v, // in: coalesced vector of vertices original::device_vec_t const& - d_v_col_indx, // in: column indices, given by stepper's random engine + d_v_col_indx, // in: column indices, given by stepper's random engine original::device_vec_t& d_v_next_vertices, // out: set of destination vertices, for next step original::device_vec_t& - d_v_next_weights) // out: set of weights between src and destination vertices, for next step + d_v_next_weights) // out: set of weights between src and destination vertices, for next step const { thrust::transform_if( handle_.get_thrust_policy(), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_paths_), // input1 - d_v_col_indx.begin(), // input2 - out_degs_, // stencil + thrust::make_counting_iterator(num_paths_), // input1 + d_v_col_indx.begin(), // input2 + out_degs_, // stencil thrust::make_zip_iterator( thrust::make_tuple(d_v_next_vertices.begin(), d_v_next_weights.begin())), // output [max_depth = max_depth_, @@ -575,9 +575,9 @@ struct random_walker_t { d_crt_out_degs, // |current set of vertex out degrees| = nelems, // to be used as stencil (don't scatter if 0) original::device_vec_t const& - d_sizes, // paths sizes used to provide delta in coalesced paths; - // pre-condition: assumed as updated to reflect new vertex additions; - // also, this is the number of _vertices_ in each path; + d_sizes, // paths sizes used to provide delta in coalesced paths; + // pre-condition: assumed as updated to reflect new vertex additions; + // also, this is the number of _vertices_ in each path; // hence for scattering weights this needs to be adjusted; hence the `adjust` parameter index_t stride, // stride = coalesce block size (max_depth for vertices; max_depth-1 for weights) @@ -762,7 +762,7 @@ random_walks_impl( // pre-allocate num_paths * max_depth; // original::device_vec_t d_coalesced_v(num_paths * max_depth, - stream); // coalesced vertex set + stream); // coalesced vertex set original::device_vec_t d_coalesced_w(num_paths * (max_depth - 1), stream); // coalesced weight set original::device_vec_t d_paths_sz(num_paths, stream); // paths sizes diff --git 
a/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh b/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh index 6fdb1c887f2..50f42851a1f 100644 --- a/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh +++ b/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -41,6 +42,7 @@ #include +// FIXME: deprecated, to be deleted namespace cugraph { namespace { diff --git a/cpp/src/sampling/renumber_sampled_edgelist_sg.cu b/cpp/src/sampling/renumber_sampled_edgelist_sg.cu index 46e2264a0c1..9a5f0d357b2 100644 --- a/cpp/src/sampling/renumber_sampled_edgelist_sg.cu +++ b/cpp/src/sampling/renumber_sampled_edgelist_sg.cu @@ -14,10 +14,11 @@ * limitations under the License. */ -#include +#include #include "renumber_sampled_edgelist_impl.cuh" +// FIXME: deprecated, to be deleted namespace cugraph { template std::tuple, diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh new file mode 100644 index 00000000000..ff8da72ff35 --- /dev/null +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -0,0 +1,1800 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cugraph { + +namespace { + +template +struct edge_order_t { + thrust::optional> edgelist_label_offsets{thrust::nullopt}; + thrust::optional> edgelist_hops{thrust::nullopt}; + raft::device_span edgelist_majors{}; + raft::device_span edgelist_minors{}; + + __device__ bool operator()(size_t l_idx, size_t r_idx) const + { + if (edgelist_label_offsets) { + auto l_label = thrust::distance((*edgelist_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + (*edgelist_label_offsets)[0] + l_idx)); + auto r_label = thrust::distance((*edgelist_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + (*edgelist_label_offsets)[0] + r_idx)); + if (l_label != r_label) { return l_label < r_label; } + } + + if (edgelist_hops) { + auto l_hop = (*edgelist_hops)[l_idx]; + auto r_hop = (*edgelist_hops)[r_idx]; + if (l_hop != r_hop) { return l_hop < r_hop; } + } + + auto l_major = edgelist_majors[l_idx]; + auto r_major = edgelist_majors[r_idx]; + if (l_major != r_major) { return l_major < r_major; } + + auto l_minor = edgelist_minors[l_idx]; + auto r_minor = edgelist_minors[r_idx]; + if (l_minor != r_minor) { return l_minor < r_minor; } + + return l_idx < r_idx; + } +}; + +template +struct is_first_in_run_t { + thrust::optional> edgelist_label_offsets{thrust::nullopt}; + thrust::optional> edgelist_hops{thrust::nullopt}; + raft::device_span edgelist_majors{}; + + __device__ 
bool operator()(size_t i) const + { + if (i == 0) return true; + if (edgelist_label_offsets) { + auto prev_label = thrust::distance((*edgelist_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + i - 1)); + auto this_label = thrust::distance( + (*edgelist_label_offsets).begin() + 1, + thrust::upper_bound( + thrust::seq, (*edgelist_label_offsets).begin() + 1, (*edgelist_label_offsets).end(), i)); + if (this_label != prev_label) { return true; } + } + if (edgelist_hops) { + auto prev_hop = (*edgelist_hops)[i - 1]; + auto this_hop = (*edgelist_hops)[i]; + if (this_hop != prev_hop) { return true; } + } + return edgelist_majors[i] != edgelist_majors[i - 1]; + } +}; + +template +struct compute_label_index_t { + raft::device_span edgelist_label_offsets{}; + + __device__ label_index_t operator()(size_t i) const + { + return static_cast(thrust::distance( + edgelist_label_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), i))); + } +}; + +template +struct optionally_compute_label_index_t { + thrust::optional> edgelist_label_offsets{thrust::nullopt}; + + __device__ label_index_t operator()(size_t i) const + { + return edgelist_label_offsets ? static_cast(thrust::distance( + (*edgelist_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + i))) + : label_index_t{0}; + } +}; + +template +void check_input_edges( + raft::handle_t const& handle, + rmm::device_uvector const& edgelist_srcs, + rmm::device_uvector const& edgelist_dsts, + std::optional> const& edgelist_weights, + std::optional> const& edgelist_edge_ids, + std::optional> const& edgelist_edge_types, + std::optional, size_t>> const& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS(!edgelist_label_offsets || (std::get<1>(*edgelist_label_offsets) <= + std::numeric_limits::max()), + "Invalid input arguments: current implementation assumes that the number of " + "unique labels is no larger than std::numeric_limits::max()."); + CUGRAPH_EXPECTS(!edgelist_label_offsets || std::get<1>(*edgelist_label_offsets) > 0, + "Invlaid input arguments: there should be 1 or more labels if " + "edgelist_label_offsets.has_value() is true."); + CUGRAPH_EXPECTS( + !edgelist_label_offsets.has_value() || + (std::get<0>(*edgelist_label_offsets).size() == std::get<1>(*edgelist_label_offsets) + 1), + "Invalid input arguments: if edgelist_label_offsets is valid, " + "std::get<0>(*edgelist_label_offsets).size() (size of the offset array) should be " + "std::get<1>(*edgelist_label_offsets) (number of unique labels) + 1."); + + CUGRAPH_EXPECTS( + !edgelist_hops || (std::get<1>(*edgelist_hops) <= std::numeric_limits::max()), + "Invalid input arguments: current implementation assumes that the number of " + "hops is no larger than std::numeric_limits::max()."); + CUGRAPH_EXPECTS(!edgelist_hops || std::get<1>(*edgelist_hops) > 0, + "Invlaid input arguments: number of hops should be larger than 0 if " + "edgelist_hops.has_value() is true."); + + CUGRAPH_EXPECTS( + edgelist_srcs.size() == edgelist_dsts.size(), + "Invalid input arguments: edgelist_srcs.size() and edgelist_dsts.size() should coincide."); + CUGRAPH_EXPECTS( + !edgelist_weights.has_value() || (edgelist_srcs.size() == (*edgelist_weights).size()), + "Invalid input arguments: if edgelist_weights is valid, 
std::get<0>(*edgelist_weights).size() " + "and edgelist_srcs.size() should coincide."); + CUGRAPH_EXPECTS( + !edgelist_edge_ids.has_value() || (edgelist_srcs.size() == (*edgelist_edge_ids).size()), + "Invalid input arguments: if edgelist_edge_ids is valid, " + "std::get<0>(*edgelist_edge_ids).size() and edgelist_srcs.size() should coincide."); + CUGRAPH_EXPECTS( + !edgelist_edge_types.has_value() || (edgelist_srcs.size() == (*edgelist_edge_types).size()), + "Invalid input arguments: if edgelist_edge_types is valid, " + "std::get<0>(*edgelist_edge_types).size() and edgelist_srcs.size() should coincide."); + CUGRAPH_EXPECTS( + !edgelist_hops.has_value() || (edgelist_srcs.size() == std::get<0>(*edgelist_hops).size()), + "Invalid input arguments: if edgelist_hops is valid, std::get<0>(*edgelist_hops).size() and " + "edgelist_srcs.size() should coincide."); + + if (do_expensive_check) { + if (edgelist_label_offsets) { + CUGRAPH_EXPECTS(thrust::is_sorted(handle.get_thrust_policy(), + std::get<0>(*edgelist_label_offsets).begin(), + std::get<0>(*edgelist_label_offsets).end()), + "Invalid input arguments: if edgelist_label_offsets is valid, " + "std::get<0>(*edgelist_label_offsets) should be sorted."); + size_t back_element{}; + raft::update_host( + &back_element, + std::get<0>(*edgelist_label_offsets).data() + std::get<1>(*edgelist_label_offsets), + size_t{1}, + handle.get_stream()); + handle.get_stream(); + CUGRAPH_EXPECTS( + back_element == edgelist_srcs.size(), + "Invalid input arguments: if edgelist_label_offsets is valid, the last element of " + "std::get<0>(*edgelist_label_offsets) and edgelist_srcs.size() should coincide."); + } + } +} + +// output sorted by (primary key:label_index, secondary key:vertex) +template +std::tuple> /* label indices */, + rmm::device_uvector /* vertices */, + std::optional> /* minimum hops for the vertices */, + std::optional> /* label offsets for the output */> +compute_min_hop_for_unique_label_vertex_pairs( + raft::handle_t const& handle, + raft::device_span vertices, + std::optional> hops, + std::optional> label_indices, + std::optional> label_offsets) +{ + auto approx_edges_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for segmented sort + + if (label_indices) { + auto num_labels = (*label_offsets).size() - 1; + + rmm::device_uvector tmp_label_indices((*label_indices).size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*label_indices).begin(), + (*label_indices).end(), + tmp_label_indices.begin()); + + rmm::device_uvector tmp_vertices(0, handle.get_stream()); + std::optional> tmp_hops{std::nullopt}; + + if (hops) { + tmp_vertices.resize(vertices.size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), vertices.begin(), vertices.end(), tmp_vertices.begin()); + tmp_hops = rmm::device_uvector((*hops).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), (*hops).begin(), (*hops).end(), (*tmp_hops).begin()); + + auto triplet_first = thrust::make_zip_iterator( + tmp_label_indices.begin(), tmp_vertices.begin(), (*tmp_hops).begin()); + thrust::sort( + handle.get_thrust_policy(), triplet_first, triplet_first + tmp_label_indices.size()); + auto key_first = thrust::make_zip_iterator(tmp_label_indices.begin(), tmp_vertices.begin()); + auto num_uniques = static_cast( + thrust::distance(key_first, + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + key_first, + key_first + tmp_label_indices.size(), + 
(*tmp_hops).begin())))); + tmp_label_indices.resize(num_uniques, handle.get_stream()); + tmp_vertices.resize(num_uniques, handle.get_stream()); + (*tmp_hops).resize(num_uniques, handle.get_stream()); + tmp_label_indices.shrink_to_fit(handle.get_stream()); + tmp_vertices.shrink_to_fit(handle.get_stream()); + (*tmp_hops).shrink_to_fit(handle.get_stream()); + } else { + rmm::device_uvector segment_sorted_vertices(vertices.size(), handle.get_stream()); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + + auto [h_label_offsets, h_edge_offsets] = + detail::compute_offset_aligned_edge_chunks(handle, + (*label_offsets).data(), + num_labels, + vertices.size(), + approx_edges_to_sort_per_iteration); + auto num_chunks = h_label_offsets.size() - 1; + + for (size_t i = 0; i < num_chunks; ++i) { + size_t tmp_storage_bytes{0}; + + auto offset_first = + thrust::make_transform_iterator((*label_offsets).data() + h_label_offsets[i], + detail::shift_left_t{h_edge_offsets[i]}); + cub::DeviceSegmentedSort::SortKeys(static_cast(nullptr), + tmp_storage_bytes, + vertices.begin() + h_edge_offsets[i], + segment_sorted_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + + cub::DeviceSegmentedSort::SortKeys(d_tmp_storage.data(), + tmp_storage_bytes, + vertices.begin() + h_edge_offsets[i], + segment_sorted_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + } + d_tmp_storage.resize(0, handle.get_stream()); + d_tmp_storage.shrink_to_fit(handle.get_stream()); + + auto pair_first = + thrust::make_zip_iterator(tmp_label_indices.begin(), segment_sorted_vertices.begin()); + auto num_uniques = static_cast(thrust::distance( + pair_first, + thrust::unique( + handle.get_thrust_policy(), pair_first, pair_first + tmp_label_indices.size()))); + tmp_label_indices.resize(num_uniques, handle.get_stream()); + segment_sorted_vertices.resize(num_uniques, handle.get_stream()); + tmp_label_indices.shrink_to_fit(handle.get_stream()); + segment_sorted_vertices.shrink_to_fit(handle.get_stream()); + + tmp_vertices = std::move(segment_sorted_vertices); + } + + rmm::device_uvector tmp_label_offsets(num_labels + 1, handle.get_stream()); + tmp_label_offsets.set_element_to_zero_async(0, handle.get_stream()); + thrust::upper_bound(handle.get_thrust_policy(), + tmp_label_indices.begin(), + tmp_label_indices.end(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_labels), + tmp_label_offsets.begin() + 1); + + return std::make_tuple(std::move(tmp_label_indices), + std::move(tmp_vertices), + std::move(tmp_hops), + std::move(tmp_label_offsets)); + } else { + rmm::device_uvector tmp_vertices(vertices.size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), vertices.begin(), vertices.end(), tmp_vertices.begin()); + + if (hops) { + rmm::device_uvector tmp_hops((*hops).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), (*hops).begin(), (*hops).end(), tmp_hops.begin()); + + auto pair_first = thrust::make_zip_iterator( + tmp_vertices.begin(), tmp_hops.begin()); // vertex is a primary key, hop is a secondary key + thrust::sort(handle.get_thrust_policy(), pair_first, 
pair_first + tmp_vertices.size()); + tmp_vertices.resize( + thrust::distance(tmp_vertices.begin(), + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + tmp_vertices.begin(), + tmp_vertices.end(), + tmp_hops.begin()))), + handle.get_stream()); + tmp_hops.resize(tmp_vertices.size(), handle.get_stream()); + + return std::make_tuple( + std::nullopt, std::move(tmp_vertices), std::move(tmp_hops), std::nullopt); + } else { + thrust::sort(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end()); + tmp_vertices.resize( + thrust::distance( + tmp_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end())), + handle.get_stream()); + tmp_vertices.shrink_to_fit(handle.get_stream()); + + return std::make_tuple(std::nullopt, std::move(tmp_vertices), std::nullopt, std::nullopt); + } + } +} + +template +std::tuple, std::optional>> +compute_renumber_map(raft::handle_t const& handle, + raft::device_span edgelist_majors, + raft::device_span edgelist_minors, + std::optional> edgelist_hops, + std::optional> edgelist_label_offsets) +{ + auto approx_edges_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for segmented sort + + std::optional> edgelist_label_indices{std::nullopt}; + if (edgelist_label_offsets) { + edgelist_label_indices = + detail::expand_sparse_offsets(*edgelist_label_offsets, label_index_t{0}, handle.get_stream()); + } + + auto [unique_label_major_pair_label_indices, + unique_label_major_pair_vertices, + unique_label_major_pair_hops, + unique_label_major_pair_label_offsets] = + compute_min_hop_for_unique_label_vertex_pairs( + handle, + edgelist_majors, + edgelist_hops, + edgelist_label_indices ? std::make_optional>( + (*edgelist_label_indices).data(), (*edgelist_label_indices).size()) + : std::nullopt, + edgelist_label_offsets); + + auto [unique_label_minor_pair_label_indices, + unique_label_minor_pair_vertices, + unique_label_minor_pair_hops, + unique_label_minor_pair_label_offsets] = + compute_min_hop_for_unique_label_vertex_pairs( + handle, + edgelist_minors, + edgelist_hops, + edgelist_label_indices ? std::make_optional>( + (*edgelist_label_indices).data(), (*edgelist_label_indices).size()) + : std::nullopt, + edgelist_label_offsets); + + edgelist_label_indices = std::nullopt; + + if (edgelist_label_offsets) { + auto num_labels = (*edgelist_label_offsets).size() - 1; + + rmm::device_uvector renumber_map(0, handle.get_stream()); + rmm::device_uvector renumber_map_label_indices(0, handle.get_stream()); + + renumber_map.reserve((*unique_label_major_pair_label_indices).size() + + (*unique_label_minor_pair_label_indices).size(), + handle.get_stream()); + renumber_map_label_indices.reserve(renumber_map.capacity(), handle.get_stream()); + + auto num_chunks = (edgelist_majors.size() + (approx_edges_to_sort_per_iteration - 1)) / + approx_edges_to_sort_per_iteration; + auto chunk_size = (num_chunks > 0) ? 
((num_labels + (num_chunks - 1)) / num_chunks) : 0; + + size_t copy_offset{0}; + for (size_t i = 0; i < num_chunks; ++i) { + auto major_start_offset = + (*unique_label_major_pair_label_offsets).element(chunk_size * i, handle.get_stream()); + auto major_end_offset = + (*unique_label_major_pair_label_offsets) + .element(std::min(chunk_size * (i + 1), num_labels), handle.get_stream()); + auto minor_start_offset = + (*unique_label_minor_pair_label_offsets).element(chunk_size * i, handle.get_stream()); + auto minor_end_offset = + (*unique_label_minor_pair_label_offsets) + .element(std::min(chunk_size * (i + 1), num_labels), handle.get_stream()); + + rmm::device_uvector merged_label_indices( + (major_end_offset - major_start_offset) + (minor_end_offset - minor_start_offset), + handle.get_stream()); + rmm::device_uvector merged_vertices(merged_label_indices.size(), + handle.get_stream()); + rmm::device_uvector merged_flags(merged_label_indices.size(), handle.get_stream()); + + if (edgelist_hops) { + rmm::device_uvector merged_hops(merged_label_indices.size(), handle.get_stream()); + auto major_quad_first = + thrust::make_zip_iterator((*unique_label_major_pair_label_indices).begin(), + unique_label_major_pair_vertices.begin(), + (*unique_label_major_pair_hops).begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_quad_first = + thrust::make_zip_iterator((*unique_label_minor_pair_label_indices).begin(), + unique_label_minor_pair_vertices.begin(), + (*unique_label_minor_pair_hops).begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge(handle.get_thrust_policy(), + major_quad_first + major_start_offset, + major_quad_first + major_end_offset, + minor_quad_first + minor_start_offset, + minor_quad_first + minor_end_offset, + thrust::make_zip_iterator(merged_label_indices.begin(), + merged_vertices.begin(), + merged_hops.begin(), + merged_flags.begin())); + + auto unique_key_first = + thrust::make_zip_iterator(merged_label_indices.begin(), merged_vertices.begin()); + merged_label_indices.resize( + thrust::distance( + unique_key_first, + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), + unique_key_first, + unique_key_first + merged_label_indices.size(), + thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), + handle.get_stream()); + merged_vertices.resize(merged_label_indices.size(), handle.get_stream()); + merged_hops.resize(merged_label_indices.size(), handle.get_stream()); + merged_flags.resize(merged_label_indices.size(), handle.get_stream()); + auto sort_key_first = thrust::make_zip_iterator( + merged_label_indices.begin(), merged_hops.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_label_indices.size(), + merged_vertices.begin()); + } else { + auto major_triplet_first = + thrust::make_zip_iterator((*unique_label_major_pair_label_indices).begin(), + unique_label_major_pair_vertices.begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_triplet_first = + thrust::make_zip_iterator((*unique_label_minor_pair_label_indices).begin(), + unique_label_minor_pair_vertices.begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge( + handle.get_thrust_policy(), + major_triplet_first + major_start_offset, + major_triplet_first + major_end_offset, + minor_triplet_first + minor_start_offset, + minor_triplet_first + minor_end_offset, + thrust::make_zip_iterator( + merged_label_indices.begin(), merged_vertices.begin(), merged_flags.begin())); + + 
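        // Both merge inputs above are sorted by (label_index, vertex) with a constant flag (0 for
        // majors, 1 for minors), so the merged output is ordered by (label_index, vertex, flag);
        // the unique_by_key below then keeps, for each (label_index, vertex) pair, the entry with
        // the smallest flag, i.e. vertices that appear as majors take precedence over minor-only
        // appearances (matching renumbering requirement 2 in sampling_functions.hpp).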
auto unique_key_first = + thrust::make_zip_iterator(merged_label_indices.begin(), merged_vertices.begin()); + merged_label_indices.resize( + thrust::distance( + unique_key_first, + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + unique_key_first, + unique_key_first + merged_label_indices.size(), + merged_flags.begin()))), + handle.get_stream()); + merged_vertices.resize(merged_label_indices.size(), handle.get_stream()); + merged_flags.resize(merged_label_indices.size(), handle.get_stream()); + auto sort_key_first = + thrust::make_zip_iterator(merged_label_indices.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_label_indices.size(), + merged_vertices.begin()); + } + + renumber_map.resize(copy_offset + merged_vertices.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end(), + renumber_map.begin() + copy_offset); + renumber_map_label_indices.resize(copy_offset + merged_label_indices.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + merged_label_indices.begin(), + merged_label_indices.end(), + renumber_map_label_indices.begin() + copy_offset); + + copy_offset += merged_vertices.size(); + } + + renumber_map.shrink_to_fit(handle.get_stream()); + renumber_map_label_indices.shrink_to_fit(handle.get_stream()); + + return std::make_tuple(std::move(renumber_map), std::move(renumber_map_label_indices)); + } else { + if (edgelist_hops) { + rmm::device_uvector merged_vertices( + unique_label_major_pair_vertices.size() + unique_label_minor_pair_vertices.size(), + handle.get_stream()); + rmm::device_uvector merged_hops(merged_vertices.size(), handle.get_stream()); + rmm::device_uvector merged_flags(merged_vertices.size(), handle.get_stream()); + auto major_triplet_first = + thrust::make_zip_iterator(unique_label_major_pair_vertices.begin(), + (*unique_label_major_pair_hops).begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_triplet_first = + thrust::make_zip_iterator(unique_label_minor_pair_vertices.begin(), + (*unique_label_minor_pair_hops).begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge(handle.get_thrust_policy(), + major_triplet_first, + major_triplet_first + unique_label_major_pair_vertices.size(), + minor_triplet_first, + minor_triplet_first + unique_label_minor_pair_vertices.size(), + thrust::make_zip_iterator( + merged_vertices.begin(), merged_hops.begin(), merged_flags.begin())); + + unique_label_major_pair_vertices.resize(0, handle.get_stream()); + unique_label_major_pair_vertices.shrink_to_fit(handle.get_stream()); + unique_label_major_pair_hops = std::nullopt; + unique_label_minor_pair_vertices.resize(0, handle.get_stream()); + unique_label_minor_pair_vertices.shrink_to_fit(handle.get_stream()); + unique_label_minor_pair_hops = std::nullopt; + + merged_vertices.resize( + thrust::distance(merged_vertices.begin(), + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end(), + thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), + handle.get_stream()); + merged_hops.resize(merged_vertices.size(), handle.get_stream()); + merged_flags.resize(merged_vertices.size(), handle.get_stream()); + + auto sort_key_first = thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + 
merged_vertices.begin()); + + return std::make_tuple(std::move(merged_vertices), std::nullopt); + } else { + rmm::device_uvector output_vertices(unique_label_minor_pair_vertices.size(), + handle.get_stream()); + auto output_last = thrust::set_difference(handle.get_thrust_policy(), + unique_label_minor_pair_vertices.begin(), + unique_label_minor_pair_vertices.end(), + unique_label_major_pair_vertices.begin(), + unique_label_major_pair_vertices.end(), + output_vertices.begin()); + + auto num_unique_majors = unique_label_major_pair_vertices.size(); + auto renumber_map = std::move(unique_label_major_pair_vertices); + renumber_map.resize( + renumber_map.size() + thrust::distance(output_vertices.begin(), output_last), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + output_vertices.begin(), + output_last, + renumber_map.begin() + num_unique_majors); + + return std::make_tuple(std::move(renumber_map), std::nullopt); + } + } +} + +// this function does not reorder edges (the i'th returned edge is the renumbered output of the i'th +// input edge) +template +std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +renumber_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_majors, + rmm::device_uvector&& edgelist_minors, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool do_expensive_check) +{ + // 1. compute renumber_map + + auto [renumber_map, renumber_map_label_indices] = compute_renumber_map( + handle, + raft::device_span(edgelist_majors.data(), edgelist_majors.size()), + raft::device_span(edgelist_minors.data(), edgelist_minors.size()), + edgelist_hops ? std::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : std::nullopt, + edgelist_label_offsets + ? std::make_optional>(std::get<0>(*edgelist_label_offsets)) + : std::nullopt); + + // 2. compute renumber map offsets for each label + + std::optional> renumber_map_label_offsets{}; + if (edgelist_label_offsets) { + auto num_unique_labels = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator((*renumber_map_label_indices).size()), + detail::is_first_in_run_t{(*renumber_map_label_indices).data()}); + rmm::device_uvector unique_label_indices(num_unique_labels, handle.get_stream()); + rmm::device_uvector vertex_counts(num_unique_labels, handle.get_stream()); + thrust::reduce_by_key(handle.get_thrust_policy(), + (*renumber_map_label_indices).begin(), + (*renumber_map_label_indices).end(), + thrust::make_constant_iterator(size_t{1}), + unique_label_indices.begin(), + vertex_counts.begin()); + + renumber_map_label_offsets = + rmm::device_uvector(std::get<1>(*edgelist_label_offsets) + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*renumber_map_label_offsets).begin(), + (*renumber_map_label_offsets).end(), + size_t{0}); + thrust::scatter(handle.get_thrust_policy(), + vertex_counts.begin(), + vertex_counts.end(), + unique_label_indices.begin(), + (*renumber_map_label_offsets).begin() + 1); + + thrust::inclusive_scan(handle.get_thrust_policy(), + (*renumber_map_label_offsets).begin(), + (*renumber_map_label_offsets).end(), + (*renumber_map_label_offsets).begin()); + } + + // 3. 
renumber input edges + + if (edgelist_label_offsets) { + rmm::device_uvector new_vertices(renumber_map.size(), handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + new_vertices.begin(), + new_vertices.end(), + [label_indices = raft::device_span( + (*renumber_map_label_indices).data(), (*renumber_map_label_indices).size()), + renumber_map_label_offsets = raft::device_span( + (*renumber_map_label_offsets).data(), + (*renumber_map_label_offsets).size())] __device__(size_t i) { + auto label_index = label_indices[i]; + auto label_start_offset = renumber_map_label_offsets[label_index]; + return static_cast(i - label_start_offset); + }); + + (*renumber_map_label_indices).resize(0, handle.get_stream()); + (*renumber_map_label_indices).shrink_to_fit(handle.get_stream()); + + auto num_labels = std::get<0>(*edgelist_label_offsets).size(); + + rmm::device_uvector segment_sorted_renumber_map(renumber_map.size(), + handle.get_stream()); + rmm::device_uvector segment_sorted_new_vertices(new_vertices.size(), + handle.get_stream()); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + + auto approx_edges_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for segmented sort + + auto [h_label_offsets, h_edge_offsets] = detail::compute_offset_aligned_edge_chunks( + handle, + (*renumber_map_label_offsets).data(), + static_cast((*renumber_map_label_offsets).size() - 1), + renumber_map.size(), + approx_edges_to_sort_per_iteration); + auto num_chunks = h_label_offsets.size() - 1; + + for (size_t i = 0; i < num_chunks; ++i) { + size_t tmp_storage_bytes{0}; + + auto offset_first = + thrust::make_transform_iterator((*renumber_map_label_offsets).data() + h_label_offsets[i], + detail::shift_left_t{h_edge_offsets[i]}); + cub::DeviceSegmentedSort::SortPairs(static_cast(nullptr), + tmp_storage_bytes, + renumber_map.begin() + h_edge_offsets[i], + segment_sorted_renumber_map.begin() + h_edge_offsets[i], + new_vertices.begin() + h_edge_offsets[i], + segment_sorted_new_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + + cub::DeviceSegmentedSort::SortPairs(d_tmp_storage.data(), + tmp_storage_bytes, + renumber_map.begin() + h_edge_offsets[i], + segment_sorted_renumber_map.begin() + h_edge_offsets[i], + new_vertices.begin() + h_edge_offsets[i], + segment_sorted_new_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + } + new_vertices.resize(0, handle.get_stream()); + d_tmp_storage.resize(0, handle.get_stream()); + new_vertices.shrink_to_fit(handle.get_stream()); + d_tmp_storage.shrink_to_fit(handle.get_stream()); + + auto edgelist_label_indices = detail::expand_sparse_offsets( + std::get<0>(*edgelist_label_offsets), label_index_t{0}, handle.get_stream()); + + auto pair_first = + thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_label_indices.begin()); + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + edgelist_majors.size(), + edgelist_majors.begin(), + [renumber_map_label_offsets = raft::device_span( + (*renumber_map_label_offsets).data(), 
(*renumber_map_label_offsets).size()), + old_vertices = raft::device_span(segment_sorted_renumber_map.data(), + segment_sorted_renumber_map.size()), + new_vertices = raft::device_span( + segment_sorted_new_vertices.data(), + segment_sorted_new_vertices.size())] __device__(auto pair) { + auto old_vertex = thrust::get<0>(pair); + auto label_index = thrust::get<1>(pair); + auto label_start_offset = renumber_map_label_offsets[label_index]; + auto label_end_offset = renumber_map_label_offsets[label_index + 1]; + auto it = thrust::lower_bound(thrust::seq, + old_vertices.begin() + label_start_offset, + old_vertices.begin() + label_end_offset, + old_vertex); + assert(*it == old_vertex); + return *(new_vertices.begin() + thrust::distance(old_vertices.begin(), it)); + }); + + pair_first = thrust::make_zip_iterator(edgelist_minors.begin(), edgelist_label_indices.begin()); + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + edgelist_minors.size(), + edgelist_minors.begin(), + [renumber_map_label_offsets = raft::device_span( + (*renumber_map_label_offsets).data(), (*renumber_map_label_offsets).size()), + old_vertices = raft::device_span(segment_sorted_renumber_map.data(), + segment_sorted_renumber_map.size()), + new_vertices = raft::device_span( + segment_sorted_new_vertices.data(), + segment_sorted_new_vertices.size())] __device__(auto pair) { + auto old_vertex = thrust::get<0>(pair); + auto label_index = thrust::get<1>(pair); + auto label_start_offset = renumber_map_label_offsets[label_index]; + auto label_end_offset = renumber_map_label_offsets[label_index + 1]; + auto it = thrust::lower_bound(thrust::seq, + old_vertices.begin() + label_start_offset, + old_vertices.begin() + label_end_offset, + old_vertex); + assert(*it == old_vertex); + return new_vertices[thrust::distance(old_vertices.begin(), it)]; + }); + } else { + kv_store_t kv_store(renumber_map.begin(), + renumber_map.end(), + thrust::make_counting_iterator(vertex_t{0}), + std::numeric_limits::max(), + std::numeric_limits::max(), + handle.get_stream()); + auto kv_store_view = kv_store.view(); + + kv_store_view.find( + edgelist_majors.begin(), edgelist_majors.end(), edgelist_majors.begin(), handle.get_stream()); + kv_store_view.find( + edgelist_minors.begin(), edgelist_minors.end(), edgelist_minors.begin(), handle.get_stream()); + } + + return std::make_tuple(std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(renumber_map), + std::move(renumber_map_label_offsets)); +} + +template +void permute_array(raft::handle_t const& handle, + IndexIterator index_first, + IndexIterator index_last, + ValueIterator value_first /* [INOUT] */) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto tmp_buffer = allocate_dataframe_buffer(thrust::distance(index_first, index_last), + handle.get_stream()); + thrust::gather(handle.get_thrust_policy(), + index_first, + index_last, + value_first, + get_dataframe_buffer_begin(tmp_buffer)); + thrust::copy(handle.get_thrust_policy(), + get_dataframe_buffer_begin(tmp_buffer), + get_dataframe_buffer_end(tmp_buffer), + value_first); +} + +// key: ((label), (hop), major, minor) +template +std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional, size_t>>> +sort_sampled_edge_tuples( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_majors, + rmm::device_uvector&& edgelist_minors, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + 
std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets) +{ + std::vector h_label_offsets{}; + std::vector h_edge_offsets{}; + + if (edgelist_label_offsets) { + auto approx_edges_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for sorts in chunks + + std::tie(h_label_offsets, h_edge_offsets) = + detail::compute_offset_aligned_edge_chunks(handle, + std::get<0>(*edgelist_label_offsets).data(), + std::get<1>(*edgelist_label_offsets), + edgelist_majors.size(), + approx_edges_to_sort_per_iteration); + } else { + h_label_offsets = {0, 1}; + h_edge_offsets = {0, edgelist_majors.size()}; + } + + auto num_chunks = h_label_offsets.size() - 1; + for (size_t i = 0; i < num_chunks; ++i) { + rmm::device_uvector indices(h_edge_offsets[i + 1] - h_edge_offsets[i], + handle.get_stream()); + thrust::sequence(handle.get_thrust_policy(), indices.begin(), indices.end(), size_t{0}); + edge_order_t edge_order_comp{ + edgelist_label_offsets ? thrust::make_optional>( + std::get<0>(*edgelist_label_offsets).data() + h_label_offsets[i], + (h_label_offsets[i + 1] - h_label_offsets[i]) + 1) + : thrust::nullopt, + edgelist_hops ? thrust::make_optional>( + std::get<0>(*edgelist_hops).data() + h_edge_offsets[i], indices.size()) + : thrust::nullopt, + raft::device_span(edgelist_majors.data() + h_edge_offsets[i], indices.size()), + raft::device_span(edgelist_minors.data() + h_edge_offsets[i], + indices.size())}; + thrust::sort(handle.get_thrust_policy(), indices.begin(), indices.end(), edge_order_comp); + + permute_array(handle, + indices.begin(), + indices.end(), + thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()) + + h_edge_offsets[i]); + + if (edgelist_weights) { + permute_array( + handle, indices.begin(), indices.end(), (*edgelist_weights).begin() + h_edge_offsets[i]); + } + + if (edgelist_edge_ids) { + permute_array( + handle, indices.begin(), indices.end(), (*edgelist_edge_ids).begin() + h_edge_offsets[i]); + } + + if (edgelist_edge_types) { + permute_array( + handle, indices.begin(), indices.end(), (*edgelist_edge_types).begin() + h_edge_offsets[i]); + } + + if (edgelist_hops) { + permute_array(handle, + indices.begin(), + indices.end(), + std::get<0>(*edgelist_hops).begin() + h_edge_offsets[i]); + } + } + + return std::make_tuple(std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops)); +} + +} // namespace + +template +std::tuple>, // dcsr/dcsc major vertices + rmm::device_uvector, // (d)csr/(d)csc offset values + rmm::device_uvector, // minor vertices + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>, // (label, hop) offsets to the (d)csr/(d)csc + // offset array + rmm::device_uvector, // renumber map + std::optional>> // label offsets to the renumber map +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check) +{ + using label_index_t = uint32_t; + + auto num_labels = edgelist_label_offsets ? 
std::get<1>(*edgelist_label_offsets) : size_t{1}; + auto num_hops = edgelist_hops ? std::get<1>(*edgelist_hops) : size_t{1}; + + // 1. check input arguments + + check_input_edges(handle, + edgelist_srcs, + edgelist_dsts, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + edgelist_label_offsets, + do_expensive_check); + + CUGRAPH_EXPECTS( + !doubly_compress || !compress_per_hop, + "Invalid input arguments: compress_per_hop should be false if doubly_compress is true."); + CUGRAPH_EXPECTS(!compress_per_hop || edgelist_hops, + "Invalid input arguments: edgelist_hops.has_value() should be true if " + "compress_per_hop is true."); + + // 2. renumber + + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); + + rmm::device_uvector renumber_map(0, handle.get_stream()); + std::optional> renumber_map_label_offsets{std::nullopt}; + std::tie(edgelist_majors, edgelist_minors, renumber_map, renumber_map_label_offsets) = + renumber_sampled_edgelist( + handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + edgelist_hops ? std::make_optional(std::make_tuple( + raft::device_span(std::get<0>(*edgelist_hops).data(), + std::get<0>(*edgelist_hops).size()), + num_hops)) + : std::nullopt, + edgelist_label_offsets, + do_expensive_check); + + // 3. sort by ((l), (h), major, minor) + + std::tie(edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops) = sort_sampled_edge_tuples(handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops), + edgelist_label_offsets); + + if (do_expensive_check) { + if (!compress_per_hop && edgelist_hops) { + rmm::device_uvector min_vertices(num_labels * num_hops, handle.get_stream()); + rmm::device_uvector max_vertices(min_vertices.size(), handle.get_stream()); + + auto label_index_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + optionally_compute_label_index_t{ + edgelist_label_offsets ? 
thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt}); + auto input_key_first = + thrust::make_zip_iterator(label_index_first, std::get<0>(*edgelist_hops).begin()); + rmm::device_uvector unique_key_label_indices(min_vertices.size(), + handle.get_stream()); + rmm::device_uvector unique_key_hops(min_vertices.size(), handle.get_stream()); + auto output_key_first = + thrust::make_zip_iterator(unique_key_label_indices.begin(), unique_key_hops.begin()); + + auto output_it = + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + edgelist_majors.begin(), + output_key_first, + min_vertices.begin(), + thrust::equal_to>{}, + thrust::minimum{}); + auto num_unique_keys = + static_cast(thrust::distance(output_key_first, thrust::get<0>(output_it))); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + edgelist_majors.begin(), + output_key_first, + max_vertices.begin(), + thrust::equal_to>{}, + thrust::maximum{}); + if (num_unique_keys > 1) { + auto num_invalids = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{1}), + thrust::make_counting_iterator(num_unique_keys), + [output_key_first, + min_vertices = raft::device_span(min_vertices.data(), num_unique_keys), + max_vertices = raft::device_span(max_vertices.data(), + num_unique_keys)] __device__(size_t i) { + auto prev_key = *(output_key_first + (i - 1)); + auto this_key = *(output_key_first + i); + if (thrust::get<0>(prev_key) == thrust::get<0>(this_key)) { + auto this_min = min_vertices[i]; + auto prev_max = max_vertices[i - 1]; + return prev_max >= this_min; + } else { + return false; + } + }); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input arguments: if @p compress_per_hop is false and @p " + "edgelist_hops.has_value() is true, the minimum majors with hop N + 1 " + "should be larger than the maximum majors with hop N after renumbering."); + } + } + } + + // 4. compute offsets for ((l), (h), major) triplets with non zero neighbors (update + // compressed_label_indices, compressed_hops, compressed_nzd_vertices, and compressed_offsets) + + auto num_uniques = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(edgelist_majors.size()), + is_first_in_run_t{ + edgelist_label_offsets ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops ? thrust::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : thrust::nullopt, + raft::device_span( + edgelist_majors.data(), + edgelist_majors.size())}); // number of unique ((label), (hop), major) triplets + + auto compressed_label_indices = + edgelist_label_offsets + ? std::make_optional>(num_uniques, handle.get_stream()) + : std::nullopt; + auto compressed_hops = edgelist_hops ? 
std::make_optional>( + num_uniques, handle.get_stream()) + : std::nullopt; + rmm::device_uvector compressed_nzd_vertices(num_uniques, handle.get_stream()); + rmm::device_uvector compressed_offsets(num_uniques + 1, handle.get_stream()); + compressed_offsets.set_element_to_zero_async(num_uniques, handle.get_stream()); + + if (edgelist_label_offsets) { + auto label_index_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + compute_label_index_t{std::get<0>(*edgelist_label_offsets)}); + + if (edgelist_hops) { + auto input_key_first = thrust::make_zip_iterator( + label_index_first, std::get<0>(*edgelist_hops).begin(), edgelist_majors.begin()); + auto output_key_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + (*compressed_hops).begin(), + compressed_nzd_vertices.begin()); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + thrust::make_constant_iterator(size_t{1}), + output_key_first, + compressed_offsets.begin()); + } else { + auto input_key_first = thrust::make_zip_iterator(label_index_first, edgelist_majors.begin()); + auto output_key_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + compressed_nzd_vertices.begin()); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + thrust::make_constant_iterator(size_t{1}), + output_key_first, + compressed_offsets.begin()); + } + } else { + if (edgelist_hops) { + auto input_key_first = + thrust::make_zip_iterator(std::get<0>(*edgelist_hops).begin(), edgelist_majors.begin()); + auto output_key_first = + thrust::make_zip_iterator((*compressed_hops).begin(), compressed_nzd_vertices.begin()); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + thrust::make_constant_iterator(size_t{1}), + output_key_first, + compressed_offsets.begin()); + } else { + auto input_key_first = edgelist_majors.begin(); + auto output_key_first = compressed_nzd_vertices.begin(); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + thrust::make_constant_iterator(size_t{1}), + output_key_first, + compressed_offsets.begin()); + } + } + thrust::exclusive_scan(handle.get_thrust_policy(), + compressed_offsets.begin(), + compressed_offsets.end(), + compressed_offsets.begin()); + + // 5. 
update compressed_offsets to include zero degree vertices (if doubly_compress is false) and + // compressed_offset_label_hop_offsets (if edgelist_label_offsets.has_value() or + // edgelist_hops.has_value() is true) + + std::optional> compressed_offset_label_hop_offsets{std::nullopt}; + if (doubly_compress) { + if (edgelist_label_offsets || edgelist_hops) { + rmm::device_uvector offset_array_offsets(num_labels * num_hops + 1, + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + offset_array_offsets.begin(), + offset_array_offsets.end(), + size_t{0}); + + if (edgelist_label_offsets) { + if (edgelist_hops) { + auto pair_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + (*compressed_hops).begin()); + auto value_pair_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), [num_hops] __device__(size_t i) { + return thrust::make_tuple(static_cast(i / num_hops), + static_cast(i % num_hops)); + }); + thrust::upper_bound(handle.get_thrust_policy(), + pair_first, + pair_first + (*compressed_label_indices).size(), + value_pair_first, + value_pair_first + (num_labels * num_hops), + offset_array_offsets.begin() + 1); + } else { + thrust::upper_bound( + handle.get_thrust_policy(), + (*compressed_label_indices).begin(), + (*compressed_label_indices).end(), + thrust::make_counting_iterator(label_index_t{0}), + thrust::make_counting_iterator(static_cast(num_labels)), + offset_array_offsets.begin() + 1); + } + } else { + thrust::upper_bound(handle.get_thrust_policy(), + (*compressed_hops).begin(), + (*compressed_hops).end(), + thrust::make_counting_iterator(int32_t{0}), + thrust::make_counting_iterator(static_cast(num_hops)), + offset_array_offsets.begin() + 1); + } + + compressed_offset_label_hop_offsets = std::move(offset_array_offsets); + } + } else { // !doubly_compress + rmm::device_uvector major_vertex_counts(num_labels * num_hops, handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + major_vertex_counts.begin(), + major_vertex_counts.end(), + [edgelist_label_offsets = edgelist_label_offsets + ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops = edgelist_hops + ? thrust::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : thrust::nullopt, + edgelist_majors = + raft::device_span(edgelist_majors.data(), edgelist_majors.size()), + num_hops, + compress_per_hop] __device__(size_t i) { + size_t start_offset{0}; + auto end_offset = edgelist_majors.size(); + auto label_start_offset = start_offset; + auto label_end_offset = end_offset; + + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / num_hops); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + label_start_offset = start_offset; + label_end_offset = end_offset; + } + + if (num_hops > 1) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } + if (compress_per_hop) { + return (start_offset < end_offset) ? 
(edgelist_majors[end_offset - 1] + 1) : vertex_t{0}; + } else { + if (end_offset != label_end_offset) { + return edgelist_majors[end_offset]; + } else if (label_start_offset < label_end_offset) { + return edgelist_majors[end_offset - 1] + 1; + } else { + return vertex_t{0}; + } + } + }); + + std::optional> minor_vertex_counts{std::nullopt}; + if (compress_per_hop) { + minor_vertex_counts = + rmm::device_uvector(major_vertex_counts.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*minor_vertex_counts).begin(), + (*minor_vertex_counts).end(), + vertex_t{0}); + if (edgelist_label_offsets) { + auto triplet_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + (*compressed_hops).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each(handle.get_thrust_policy(), + triplet_first, + triplet_first + compressed_nzd_vertices.size(), + [edgelist_minors = raft::device_span( + edgelist_minors.data(), edgelist_minors.size()), + compressed_offsets = raft::device_span( + compressed_offsets.data(), compressed_offsets.size()), + minor_vertex_counts = raft::device_span( + (*minor_vertex_counts).data(), (*minor_vertex_counts).size()), + num_hops] __device__(auto triplet) { + auto nzd_v_idx = thrust::get<2>(triplet); + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto l_idx = thrust::get<0>(triplet); + auto h = thrust::get<1>(triplet); + cuda::atomic_ref minor_vertex_count( + minor_vertex_counts[l_idx * num_hops + h]); + minor_vertex_count.fetch_max(edgelist_minors[end_offset - 1] + 1, + cuda::std::memory_order_relaxed); + }); + } else { + auto pair_first = thrust::make_zip_iterator((*compressed_hops).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each(handle.get_thrust_policy(), + pair_first, + pair_first + compressed_nzd_vertices.size(), + [edgelist_minors = raft::device_span( + edgelist_minors.data(), edgelist_minors.size()), + compressed_offsets = raft::device_span( + compressed_offsets.data(), compressed_offsets.size()), + minor_vertex_counts = raft::device_span( + (*minor_vertex_counts).data(), (*minor_vertex_counts).size()), + num_hops] __device__(auto pair) { + auto nzd_v_idx = thrust::get<1>(pair); + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto h = thrust::get<0>(pair); + cuda::atomic_ref minor_vertex_count( + minor_vertex_counts[h]); + minor_vertex_count.fetch_max(edgelist_minors[end_offset - 1] + 1, + cuda::std::memory_order_relaxed); + }); + } + } + + rmm::device_uvector offset_array_offsets(num_labels * num_hops + 1, + handle.get_stream()); + offset_array_offsets.set_element_to_zero_async(num_labels * num_hops, handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + offset_array_offsets.begin(), + offset_array_offsets.begin() + (num_labels * num_hops), + [major_vertex_counts = + raft::device_span(major_vertex_counts.data(), major_vertex_counts.size()), + minor_vertex_counts = minor_vertex_counts + ? 
thrust::make_optional>( + (*minor_vertex_counts).data(), (*minor_vertex_counts).size()) + : thrust::nullopt, + num_hops, + compress_per_hop] __device__(size_t i) { + auto vertex_count = major_vertex_counts[i]; + if (num_hops > 1) { + if (compress_per_hop) { + for (size_t j = (i - (i % num_hops)); j < i; ++j) { + vertex_count = cuda::std::max(vertex_count, major_vertex_counts[j]); + vertex_count = cuda::std::max(vertex_count, (*minor_vertex_counts)[j]); + } + } else { + if (i % num_hops != 0) { vertex_count -= major_vertex_counts[i - 1]; } + } + } + return vertex_count; + }); + thrust::exclusive_scan(handle.get_thrust_policy(), + offset_array_offsets.begin(), + offset_array_offsets.end(), + offset_array_offsets.begin()); + + auto tmp_compressed_offsets = rmm::device_uvector( + offset_array_offsets.back_element(handle.get_stream()) + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + tmp_compressed_offsets.begin(), + tmp_compressed_offsets.end(), + size_t{0}); + + if (edgelist_label_offsets) { + if (edgelist_hops) { + auto triplet_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + (*compressed_hops).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each( + handle.get_thrust_policy(), + triplet_first, + triplet_first + compressed_nzd_vertices.size(), + [compressed_nzd_vertices = raft::device_span( + compressed_nzd_vertices.data(), compressed_nzd_vertices.size()), + offset_array_offsets = raft::device_span(offset_array_offsets.data(), + offset_array_offsets.size()), + compressed_offsets = + raft::device_span(compressed_offsets.data(), compressed_offsets.size()), + tmp_compressed_offsets = raft::device_span(tmp_compressed_offsets.data(), + tmp_compressed_offsets.size()), + compress_per_hop, + num_hops] __device__(auto triplet) { + auto nzd_v_idx = thrust::get<2>(triplet); + size_t start_offset = compressed_offsets[nzd_v_idx]; + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto l_idx = thrust::get<0>(triplet); + auto h = thrust::get<1>(triplet); + tmp_compressed_offsets[offset_array_offsets[l_idx * num_hops + + (compress_per_hop ? 
h : int32_t{0})] + + compressed_nzd_vertices[nzd_v_idx]] = end_offset - start_offset; + }); + } else { + auto pair_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each( + handle.get_thrust_policy(), + pair_first, + pair_first + compressed_nzd_vertices.size(), + [compressed_nzd_vertices = raft::device_span( + compressed_nzd_vertices.data(), compressed_nzd_vertices.size()), + offset_array_offsets = raft::device_span(offset_array_offsets.data(), + offset_array_offsets.size()), + compressed_offsets = + raft::device_span(compressed_offsets.data(), compressed_offsets.size()), + tmp_compressed_offsets = raft::device_span( + tmp_compressed_offsets.data(), tmp_compressed_offsets.size())] __device__(auto pair) { + auto nzd_v_idx = thrust::get<1>(pair); + size_t start_offset = compressed_offsets[nzd_v_idx]; + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto l_idx = thrust::get<0>(pair); + tmp_compressed_offsets[offset_array_offsets[l_idx] + + compressed_nzd_vertices[nzd_v_idx]] = end_offset - start_offset; + }); + } + } else { + if (edgelist_hops) { + auto pair_first = thrust::make_zip_iterator((*compressed_hops).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each( + handle.get_thrust_policy(), + pair_first, + pair_first + compressed_nzd_vertices.size(), + [compressed_nzd_vertices = raft::device_span( + compressed_nzd_vertices.data(), compressed_nzd_vertices.size()), + offset_array_offsets = raft::device_span(offset_array_offsets.data(), + offset_array_offsets.size()), + compressed_offsets = + raft::device_span(compressed_offsets.data(), compressed_offsets.size()), + tmp_compressed_offsets = raft::device_span(tmp_compressed_offsets.data(), + tmp_compressed_offsets.size()), + compress_per_hop] __device__(auto pair) { + auto nzd_v_idx = thrust::get<1>(pair); + size_t start_offset = compressed_offsets[nzd_v_idx]; + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto h = thrust::get<0>(pair); + tmp_compressed_offsets[offset_array_offsets[compress_per_hop ? h : int32_t{0}] + + compressed_nzd_vertices[nzd_v_idx]] = end_offset - start_offset; + }); + } else { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(compressed_nzd_vertices.size()), + [compressed_nzd_vertices = raft::device_span( + compressed_nzd_vertices.data(), compressed_nzd_vertices.size()), + compressed_offsets = + raft::device_span(compressed_offsets.data(), compressed_offsets.size()), + tmp_compressed_offsets = + raft::device_span(tmp_compressed_offsets.data(), + tmp_compressed_offsets.size())] __device__(auto nzd_v_idx) { + size_t start_offset = compressed_offsets[nzd_v_idx]; + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + tmp_compressed_offsets[compressed_nzd_vertices[nzd_v_idx]] = end_offset - start_offset; + }); + } + } + + thrust::exclusive_scan(handle.get_thrust_policy(), + tmp_compressed_offsets.begin(), + tmp_compressed_offsets.end(), + tmp_compressed_offsets.begin()); + + compressed_offsets = std::move(tmp_compressed_offsets); + + if (edgelist_label_offsets || edgelist_hops) { + compressed_offset_label_hop_offsets = std::move(offset_array_offsets); + } + } + + edgelist_hops = std::nullopt; + + return std::make_tuple( + doubly_compress ? 
std::make_optional(std::move(compressed_nzd_vertices)) : std::nullopt, + std::move(compressed_offsets), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(compressed_offset_label_hop_offsets), + std::move(renumber_map), + std::move(renumber_map_label_offsets)); +} + +template +std::tuple, // srcs + rmm::device_uvector, // dsts + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>, // (label, hop) offsets to the edges + rmm::device_uvector, // renumber map + std::optional>> // label offsets to the renumber map +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check) +{ + using label_index_t = uint32_t; + + auto num_labels = edgelist_label_offsets ? std::get<1>(*edgelist_label_offsets) : size_t{1}; + auto num_hops = edgelist_hops ? std::get<1>(*edgelist_hops) : size_t{1}; + + // 1. check input arguments + + check_input_edges(handle, + edgelist_srcs, + edgelist_dsts, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + edgelist_label_offsets, + do_expensive_check); + + // 2. renumber + + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); + + rmm::device_uvector renumber_map(0, handle.get_stream()); + std::optional> renumber_map_label_offsets{std::nullopt}; + std::tie(edgelist_majors, edgelist_minors, renumber_map, renumber_map_label_offsets) = + renumber_sampled_edgelist( + handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + edgelist_hops ? std::make_optional(std::make_tuple( + raft::device_span(std::get<0>(*edgelist_hops).data(), + std::get<0>(*edgelist_hops).size()), + num_hops)) + : std::nullopt, + edgelist_label_offsets, + do_expensive_check); + + // 3. sort by ((l), (h), major, minor) + + std::tie(edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops) = sort_sampled_edge_tuples(handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops), + edgelist_label_offsets); + + // 4. compute edgelist_label_hop_offsets + + std::optional> edgelist_label_hop_offsets{std::nullopt}; + if (edgelist_label_offsets || edgelist_hops) { + edgelist_label_hop_offsets = + rmm::device_uvector(num_labels * num_hops + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*edgelist_label_hop_offsets).begin(), + (*edgelist_label_hop_offsets).end(), + size_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_labels * num_hops), + [edgelist_label_offsets = edgelist_label_offsets + ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops = edgelist_hops + ? 
thrust::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : thrust::nullopt, + num_hops, + num_edges = edgelist_majors.size()] __device__(size_t i) { + size_t start_offset{0}; + auto end_offset = num_edges; + + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / num_hops); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + } + + if (edgelist_hops) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } + + return end_offset - start_offset; + }); + thrust::exclusive_scan(handle.get_thrust_policy(), + (*edgelist_label_hop_offsets).begin(), + (*edgelist_label_hop_offsets).end(), + (*edgelist_label_hop_offsets).begin()); + } + + edgelist_hops = std::nullopt; + + return std::make_tuple(std::move(src_is_major ? edgelist_majors : edgelist_minors), + std::move(src_is_major ? edgelist_minors : edgelist_majors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_label_hop_offsets), + std::move(renumber_map), + std::move(renumber_map_label_offsets)); +} + +template +std::tuple, // srcs + rmm::device_uvector, // dsts + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>> // (label, hop) offsets to the edges +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check) +{ + using label_index_t = uint32_t; + + auto num_labels = edgelist_label_offsets ? std::get<1>(*edgelist_label_offsets) : size_t{1}; + auto num_hops = edgelist_hops ? std::get<1>(*edgelist_hops) : size_t{1}; + + // 1. check input arguments + + check_input_edges(handle, + edgelist_srcs, + edgelist_dsts, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + edgelist_label_offsets, + do_expensive_check); + + // 2. sort by ((l), (h), major, minor) + + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); + + std::tie(edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops) = sort_sampled_edge_tuples(handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops), + edgelist_label_offsets); + + // 3. 
compute edgelist_label_hop_offsets + + std::optional> edgelist_label_hop_offsets{std::nullopt}; + if (edgelist_label_offsets || edgelist_hops) { + edgelist_label_hop_offsets = + rmm::device_uvector(num_labels * num_hops + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*edgelist_label_hop_offsets).begin(), + (*edgelist_label_hop_offsets).end(), + size_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_labels * num_hops), + [edgelist_label_offsets = edgelist_label_offsets + ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops = edgelist_hops + ? thrust::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : thrust::nullopt, + num_hops, + num_edges = edgelist_majors.size()] __device__(size_t i) { + size_t start_offset{0}; + auto end_offset = num_edges; + + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / num_hops); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + } + + if (edgelist_hops) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } + + return end_offset - start_offset; + }); + thrust::exclusive_scan(handle.get_thrust_policy(), + (*edgelist_label_hop_offsets).begin(), + (*edgelist_label_hop_offsets).end(), + (*edgelist_label_hop_offsets).begin()); + } + + edgelist_hops = std::nullopt; + + return std::make_tuple(std::move(src_is_major ? edgelist_majors : edgelist_minors), + std::move(src_is_major ? edgelist_minors : edgelist_majors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_label_hop_offsets)); +} + +} // namespace cugraph diff --git a/cpp/src/sampling/sampling_post_processing_sg.cu b/cpp/src/sampling/sampling_post_processing_sg.cu new file mode 100644 index 00000000000..75e3c5f005a --- /dev/null +++ b/cpp/src/sampling/sampling_post_processing_sg.cu @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "sampling_post_processing_impl.cuh" + +namespace cugraph { + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + 
rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& 
edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index d7381ba71af..6bc19ff4fe1 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -86,7 +86,7 @@ struct find_unused_id_t { for (size_t i = 
worker_id; i < sorted_local_vertices.size() + size_t{1}; i += num_workers) { auto start = (i == size_t{0}) ? std::numeric_limits::lowest() : sorted_local_vertices[i - size_t{1}]; - if (start != std::numeric_limits::max()) { ++start; }; // now inclusive + if (start != std::numeric_limits::max()) { ++start; }; // now inclusive auto end = (i == sorted_local_vertices.size()) ? std::numeric_limits::max() : sorted_local_vertices[i]; // exclusive for (vertex_t v = start; v < end; ++v) { diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 0402184bd93..437071569bf 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -73,7 +73,7 @@ struct e_op_t { if (*(prev_visited_flags + packed_bool_offset(dst)) & packed_bool_mask(dst)) { // check if unvisited in previous iterations push = false; - } else { // check if unvisited in this iteration as well + } else { // check if unvisited in this iteration as well auto old = visited_flags.atomic_or(dst, true); push = !old; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 6f8c727789f..2a4bb8ab2a5 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -390,9 +390,9 @@ ConfigureTest(UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/sg_uniform_neighbor_sampli target_link_libraries(UNIFORM_NEIGHBOR_SAMPLING_TEST PRIVATE cuco::cuco) ################################################################################################### -# - RENUMBER SAMPLED EDGE LIST tests -------------------------------------------------------------- -ConfigureTest(RENUMBER_SAMPLED_EDGELIST_TEST sampling/renumber_sampled_edgelist_test.cu) -target_link_libraries(RENUMBER_SAMPLED_EDGELIST_TEST PRIVATE cuco::cuco) +# - SAMPLING_POST_PROCESSING tests ---------------------------------------------------------------- +ConfigureTest(SAMPLING_POST_PROCESSING_TEST sampling/sampling_post_processing_test.cu) +target_link_libraries(SAMPLING_POST_PROCESSING_TEST PRIVATE cuco::cuco) ################################################################################################### # - Renumber tests -------------------------------------------------------------------------------- @@ -419,6 +419,14 @@ ConfigureTest(K_HOP_NBRS_TEST traversal/k_hop_nbrs_test.cpp) # - install tests --------------------------------------------------------------------------------- rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libcugraph) +################################################################################################### +# - MTMG tests ------------------------------------------------------------------------- +ConfigureTest(MTMG_TEST mtmg/threaded_test.cu) +target_link_libraries(MTMG_TEST + PRIVATE + UCP::UCP + ) + ################################################################################################### # - MG tests -------------------------------------------------------------------------------------- diff --git a/cpp/tests/mtmg/threaded_test.cu b/cpp/tests/mtmg/threaded_test.cu new file mode 100644 index 00000000000..c5dc2d3c7ce --- /dev/null +++ b/cpp/tests/mtmg/threaded_test.cu @@ -0,0 +1,459 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +#include + +#include +#include + +struct Multithreaded_Usecase { + bool test_weighted{false}; + bool check_correctness{true}; +}; + +template +class Tests_Multithreaded + : public ::testing::TestWithParam> { + public: + Tests_Multithreaded() {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + std::vector get_gpu_list() + { + int num_gpus_per_node{1}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + + std::vector gpu_list(num_gpus_per_node); + std::iota(gpu_list.begin(), gpu_list.end(), 0); + + return gpu_list; + } + + template + void run_current_test( + std::tuple const& param, + std::vector gpu_list) + { + using edge_type_t = int32_t; + + constexpr bool renumber = true; + constexpr bool do_expensive_check = false; + + auto [multithreaded_usecase, input_usecase] = param; + + raft::handle_t handle{}; + + result_t constexpr alpha{0.85}; + result_t constexpr epsilon{1e-6}; + + size_t device_buffer_size{64 * 1024 * 1024}; + size_t thread_buffer_size{4 * 1024 * 1024}; + + int num_gpus = gpu_list.size(); + int num_threads = num_gpus * 4; + + cugraph::mtmg::resource_manager_t resource_manager; + + std::for_each(gpu_list.begin(), gpu_list.end(), [&resource_manager](int gpu_id) { + resource_manager.register_local_gpu(gpu_id, rmm::cuda_device_id{gpu_id}); + }); + + ncclUniqueId instance_manager_id; + ncclGetUniqueId(&instance_manager_id); + + auto instance_manager = resource_manager.create_instance_manager( + resource_manager.registered_ranks(), instance_manager_id); + + cugraph::mtmg::edgelist_t edgelist; + cugraph::mtmg::graph_t graph; + cugraph::mtmg::graph_view_t graph_view; + cugraph::mtmg::vertex_result_t pageranks; + std::optional> renumber_map = + std::make_optional>(); + + auto edge_weights = multithreaded_usecase.test_weighted + ? 
std::make_optional, + weight_t>>() + : std::nullopt; + + // + // Simulate graph creation by spawning threads to walk through the + // local COO and add edges + // + std::vector running_threads; + + // Initialize shared edgelist object, one per GPU + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &edgelist, + device_buffer_size, + use_weight = true, + use_edge_id = false, + use_edge_type = false]() { + auto thread_handle = instance_manager->get_handle(); + + edgelist.set(thread_handle, device_buffer_size, use_weight, use_edge_id, use_edge_type); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + // Load SG edge list + auto [d_src_v, d_dst_v, d_weights_v, d_vertices_v, is_symmetric] = + input_usecase.template construct_edgelist( + handle, multithreaded_usecase.test_weighted, false, false); + + auto h_src_v = cugraph::test::to_host(handle, d_src_v); + auto h_dst_v = cugraph::test::to_host(handle, d_dst_v); + auto h_weights_v = cugraph::test::to_host(handle, d_weights_v); + auto unique_vertices = cugraph::test::to_host(handle, d_vertices_v); + + // Load edgelist from different threads. We'll use more threads than GPUs here + for (int i = 0; i < num_threads; ++i) { + running_threads.emplace_back([&instance_manager, + thread_buffer_size, + &edgelist, + &h_src_v, + &h_dst_v, + &h_weights_v, + i, + num_threads]() { + auto thread_handle = instance_manager->get_handle(); + cugraph::mtmg::per_thread_edgelist_t + per_thread_edgelist(edgelist.get(thread_handle), thread_buffer_size); + + for (size_t j = i; j < h_src_v.size(); j += num_threads) { +#if 0 + if (h_weights_v) { + thread_edgelist.append( + thread_handle, h_src_v[j], h_dst_v[j], (*h_weights_v)[j], std::nullopt, std::nullopt); + } else { + thread_edgelist.append( + thread_handle, h_src_v[j], h_dst_v[j], std::nullopt, std::nullopt, std::nullopt); + } +#endif + per_thread_edgelist.append( + thread_handle, + h_src_v[j], + h_dst_v[j], + h_weights_v ? 
std::make_optional((*h_weights_v)[j]) : std::nullopt, + std::nullopt, + std::nullopt); + } + + per_thread_edgelist.flush(thread_handle); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &graph, + &edge_weights, + &edgelist, + &renumber_map, + &pageranks, + is_symmetric = is_symmetric, + renumber, + do_expensive_check]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_thread_rank() > 0) return; + + std::optional, + edge_t>> + edge_ids{std::nullopt}; + std::optional, + int32_t>> + edge_types{std::nullopt}; + + edgelist.finalize_buffer(thread_handle); + edgelist.consolidate_and_shuffle(thread_handle, true); + + cugraph::mtmg:: + create_graph_from_edgelist( + thread_handle, + edgelist, + cugraph::graph_properties_t{is_symmetric, true}, + renumber, + graph, + edge_weights, + edge_ids, + edge_types, + renumber_map, + do_expensive_check); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + graph_view = graph.view(); + + for (int i = 0; i < num_threads; ++i) { + running_threads.emplace_back( + [&instance_manager, &graph_view, &edge_weights, &pageranks, alpha, epsilon]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_thread_rank() > 0) return; + + auto [local_pageranks, metadata] = + cugraph::pagerank( + thread_handle.raft_handle(), + graph_view.get(thread_handle), + edge_weights ? std::make_optional(edge_weights->get(thread_handle).view()) + : std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + alpha, + epsilon, + 500, + true); + + pageranks.set(thread_handle, std::move(local_pageranks)); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + std::vector, std::vector>> computed_pageranks_v; + std::mutex computed_pageranks_lock{}; + + auto pageranks_view = pageranks.view(); + auto renumber_map_view = renumber_map ? std::make_optional(renumber_map->view()) : std::nullopt; + + // Load computed_pageranks from different threads. 
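+    // Each spawned thread takes a strided subset of the vertices, gathers their PageRank values
+    // through the MTMG vertex_result_t view (resolving external vertex IDs via the renumber map
+    // view), and appends its (vertex, pagerank) pairs to the shared result vector under
+    // computed_pageranks_lock.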
+ for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &graph_view, + &renumber_map_view, + &pageranks_view, + &computed_pageranks_lock, + &computed_pageranks_v, + &h_src_v, + &h_dst_v, + &h_weights_v, + &unique_vertices, + i, + num_threads]() { + auto thread_handle = instance_manager->get_handle(); + + auto number_of_vertices = unique_vertices->size(); + + std::vector my_vertex_list; + my_vertex_list.reserve((number_of_vertices + num_threads - 1) / num_threads); + + for (size_t j = i; j < number_of_vertices; j += num_threads) { + my_vertex_list.push_back((*unique_vertices)[j]); + } + + rmm::device_uvector d_my_vertex_list(my_vertex_list.size(), + thread_handle.raft_handle().get_stream()); + raft::update_device(d_my_vertex_list.data(), + my_vertex_list.data(), + my_vertex_list.size(), + thread_handle.raft_handle().get_stream()); + + auto d_my_pageranks = pageranks_view.gather( + thread_handle, + raft::device_span{d_my_vertex_list.data(), d_my_vertex_list.size()}, + graph_view, + renumber_map_view); + + std::vector my_pageranks(d_my_pageranks.size()); + raft::update_host(my_pageranks.data(), + d_my_pageranks.data(), + d_my_pageranks.size(), + thread_handle.raft_handle().get_stream()); + + { + std::lock_guard lock(computed_pageranks_lock); + computed_pageranks_v.push_back( + std::make_tuple(std::move(my_vertex_list), std::move(my_pageranks))); + } + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + if (multithreaded_usecase.check_correctness) { + // Want to compare the results in computed_pageranks_v with SG results + cugraph::graph_t sg_graph(handle); + std::optional< + cugraph::edge_property_t, weight_t>> + sg_edge_weights{std::nullopt}; + std::optional> sg_renumber_map{std::nullopt}; + + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, sg_renumber_map) = cugraph:: + create_graph_from_edgelist( + handle, + std::nullopt, + std::move(d_src_v), + std::move(d_dst_v), + std::move(d_weights_v), + std::nullopt, + std::nullopt, + cugraph::graph_properties_t{is_symmetric, true}, + true); + + auto [sg_pageranks, meta] = cugraph::pagerank( + handle, + sg_graph.view(), + sg_edge_weights ? 
std::make_optional(sg_edge_weights->view()) : std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + alpha, + epsilon); + + auto h_sg_pageranks = cugraph::test::to_host(handle, sg_pageranks); + auto h_sg_renumber_map = cugraph::test::to_host(handle, sg_renumber_map); + auto compare_functor = cugraph::test::nearly_equal{ + weight_t{1e-3}, + weight_t{(weight_t{1} / static_cast(h_sg_pageranks.size())) * weight_t{1e-3}}}; + + std::for_each( + computed_pageranks_v.begin(), + computed_pageranks_v.end(), + [h_sg_pageranks, compare_functor, h_sg_renumber_map](auto t1) { + std::for_each( + thrust::make_zip_iterator(std::get<0>(t1).begin(), std::get<1>(t1).begin()), + thrust::make_zip_iterator(std::get<0>(t1).end(), std::get<1>(t1).end()), + [h_sg_pageranks, compare_functor, h_sg_renumber_map](auto t2) { + vertex_t v = thrust::get<0>(t2); + weight_t pr = thrust::get<1>(t2); + + auto pos = std::find(h_sg_renumber_map->begin(), h_sg_renumber_map->end(), v); + auto offset = std::distance(h_sg_renumber_map->begin(), pos); + + ASSERT_TRUE(compare_functor(pr, h_sg_pageranks[offset])) + << "vertex " << v << ", SG result = " << h_sg_pageranks[offset] + << ", mtmg result = " << pr << ", renumber map = " << (*h_sg_renumber_map)[offset]; + }); + }); + } + } +}; + +using Tests_Multithreaded_File = Tests_Multithreaded; +using Tests_Multithreaded_Rmat = Tests_Multithreaded; + +// FIXME: add tests for type combinations +TEST_P(Tests_Multithreaded_File, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam()), std::vector{{0, 1}}); +} + +TEST_P(Tests_Multithreaded_Rmat, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), std::vector{{0, 1}}); +} + +INSTANTIATE_TEST_SUITE_P(file_test, + Tests_Multithreaded_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(Multithreaded_Usecase{false, true}, + Multithreaded_Usecase{true, true}), + ::testing::Values(cugraph::test::File_Usecase("karate.csv"), + cugraph::test::File_Usecase("dolphins.csv")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Multithreaded_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values(Multithreaded_Usecase{false, true}, Multithreaded_Usecase{true, true}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with + --gtest_filter to select only the file_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one File_Usecase that differ only in filename + (to avoid running same benchmarks more than once) */ + Tests_Multithreaded_File, + ::testing::Combine( + // disable correctness checks + ::testing::Values(Multithreaded_Usecase{false, false}, Multithreaded_Usecase{true, false}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_Multithreaded_Rmat, + ::testing::Combine( + // disable correctness checks for large graphs 
+ ::testing::Values(Multithreaded_Usecase{false, false}, Multithreaded_Usecase{true, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/prims/mg_extract_transform_e.cu b/cpp/tests/prims/mg_extract_transform_e.cu index b71fe5ddb5e..bca6471a5bb 100644 --- a/cpp/tests/prims/mg_extract_transform_e.cu +++ b/cpp/tests/prims/mg_extract_transform_e.cu @@ -157,8 +157,8 @@ class Tests_MGExtractTransformE // 1. create MG graph constexpr bool is_multi_gpu = true; - constexpr bool renumber = true; // needs to be true for multi gpu case - constexpr bool store_transposed = false; // needs to be false for using extract_transform_e + constexpr bool renumber = true; // needs to be true for multi gpu case + constexpr bool store_transposed = false; // needs to be false for using extract_transform_e if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement handle_->get_comms().barrier(); diff --git a/cpp/tests/sampling/renumber_sampled_edgelist_test.cu b/cpp/tests/sampling/renumber_sampled_edgelist_test.cu deleted file mode 100644 index 96c8d6173e7..00000000000 --- a/cpp/tests/sampling/renumber_sampled_edgelist_test.cu +++ /dev/null @@ -1,512 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include - -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -struct RenumberSampledEdgelist_Usecase { - size_t num_vertices{}; - size_t num_sampled_edges{}; - size_t num_hops{1}; // enabled if larger than 1 - size_t num_labels{1}; // enabled if larger than 1 - bool check_correctness{true}; -}; - -class Tests_RenumberSampledEdgelist - : public ::testing::TestWithParam { - public: - Tests_RenumberSampledEdgelist() {} - - static void SetUpTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - template - void run_current_test(RenumberSampledEdgelist_Usecase const& usecase) - { - using label_t = int32_t; - - raft::handle_t handle{}; - HighResTimer hr_timer{}; - - raft::random::RngState rng_state(0); - - rmm::device_uvector org_edgelist_srcs(usecase.num_sampled_edges, handle.get_stream()); - rmm::device_uvector org_edgelist_dsts(usecase.num_sampled_edges, handle.get_stream()); - cugraph::detail::uniform_random_fill(handle.get_stream(), - org_edgelist_srcs.data(), - org_edgelist_srcs.size(), - vertex_t{0}, - static_cast(usecase.num_vertices), - rng_state); - cugraph::detail::uniform_random_fill(handle.get_stream(), - org_edgelist_dsts.data(), - org_edgelist_dsts.size(), - vertex_t{0}, - static_cast(usecase.num_vertices), - rng_state); - - std::optional> edgelist_hops{std::nullopt}; - if (usecase.num_hops > 1) { - edgelist_hops = rmm::device_uvector(usecase.num_sampled_edges, handle.get_stream()); - cugraph::detail::uniform_random_fill(handle.get_stream(), - (*edgelist_hops).data(), - (*edgelist_hops).size(), - int32_t{0}, - static_cast(usecase.num_hops), - rng_state); - } - - std::optional, rmm::device_uvector>> - label_offsets{std::nullopt}; - if (usecase.num_labels > 1) { - rmm::device_uvector labels(usecase.num_labels, handle.get_stream()); - thrust::sequence(handle.get_thrust_policy(), labels.begin(), labels.end(), label_t{0}); - - rmm::device_uvector edgelist_labels(usecase.num_sampled_edges, handle.get_stream()); - cugraph::detail::uniform_random_fill(handle.get_stream(), - edgelist_labels.data(), - edgelist_labels.size(), - label_t{0}, - static_cast(usecase.num_labels), - rng_state); - - rmm::device_uvector offsets(usecase.num_labels + 1, handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), offsets.begin(), offsets.end(), size_t{0}); - - thrust::for_each( - handle.get_thrust_policy(), - edgelist_labels.begin(), - edgelist_labels.end(), - [offsets = - raft::device_span(offsets.data(), offsets.size())] __device__(label_t label) { - cuda::atomic_ref atomic_counter(offsets[label]); - atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); - }); - - thrust::exclusive_scan( - handle.get_thrust_policy(), offsets.begin(), offsets.end(), offsets.begin()); - - label_offsets = std::make_tuple(std::move(labels), std::move(offsets)); - } - - rmm::device_uvector renumbered_edgelist_srcs(org_edgelist_srcs.size(), - handle.get_stream()); - rmm::device_uvector renumbered_edgelist_dsts(org_edgelist_dsts.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - org_edgelist_srcs.begin(), - org_edgelist_srcs.end(), - renumbered_edgelist_srcs.begin()); - thrust::copy(handle.get_thrust_policy(), - org_edgelist_dsts.begin(), - org_edgelist_dsts.end(), - renumbered_edgelist_dsts.begin()); - - rmm::device_uvector renumber_map(0, handle.get_stream()); - std::optional> 
renumber_map_label_offsets{std::nullopt}; - - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - hr_timer.start("Renumber sampled edgelist"); - } - - std::tie(renumbered_edgelist_srcs, - renumbered_edgelist_dsts, - renumber_map, - renumber_map_label_offsets) = - cugraph::renumber_sampled_edgelist( - handle, - std::move(renumbered_edgelist_srcs), - std::move(renumbered_edgelist_dsts), - edgelist_hops ? std::make_optional>( - (*edgelist_hops).data(), (*edgelist_hops).size()) - : std::nullopt, - label_offsets - ? std::make_optional< - std::tuple, raft::device_span>>( - std::make_tuple(raft::device_span(std::get<0>(*label_offsets).data(), - std::get<0>(*label_offsets).size()), - raft::device_span(std::get<1>(*label_offsets).data(), - std::get<1>(*label_offsets).size()))) - : std::nullopt); - - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - hr_timer.stop(); - hr_timer.display_and_clear(std::cout); - } - - if (usecase.check_correctness) { - for (size_t i = 0; i < usecase.num_labels; ++i) { - size_t edgelist_start_offset = - label_offsets ? std::get<1>(*label_offsets).element(i, handle.get_stream()) : size_t{0}; - size_t edgelist_end_offset = - label_offsets ? std::get<1>(*label_offsets).element(i + 1, handle.get_stream()) - : usecase.num_sampled_edges; - if (edgelist_start_offset == edgelist_end_offset) continue; - - auto this_label_org_edgelist_srcs = - raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_org_edgelist_dsts = - raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_edgelist_hops = edgelist_hops - ? std::make_optional>( - (*edgelist_hops).data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset) - : std::nullopt; - auto this_label_renumbered_edgelist_srcs = - raft::device_span(renumbered_edgelist_srcs.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_renumbered_edgelist_dsts = - raft::device_span(renumbered_edgelist_dsts.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - - size_t renumber_map_start_offset = - renumber_map_label_offsets ? (*renumber_map_label_offsets).element(i, handle.get_stream()) - : size_t{0}; - size_t renumber_map_end_offset = - renumber_map_label_offsets - ? 
(*renumber_map_label_offsets).element(i + 1, handle.get_stream()) - : renumber_map.size(); - auto this_label_renumber_map = - raft::device_span(renumber_map.data() + renumber_map_start_offset, - renumber_map_end_offset - renumber_map_start_offset); - - // check un-renumbering recovers the original edge list - - auto pair_first = thrust::make_zip_iterator(this_label_org_edgelist_srcs.begin(), - this_label_renumbered_edgelist_srcs.begin()); - auto num_renumber_errors = - thrust::count_if(handle.get_thrust_policy(), - pair_first, - pair_first + this_label_org_edgelist_srcs.size(), - [this_label_renumber_map] __device__(auto pair) { - auto org = thrust::get<0>(pair); - auto renumbered = thrust::get<1>(pair); - return this_label_renumber_map[renumbered] != org; - }); - ASSERT_TRUE(num_renumber_errors == 0) << "Renumber error in edge list sources."; - - pair_first = thrust::make_zip_iterator(this_label_org_edgelist_dsts.begin(), - this_label_renumbered_edgelist_dsts.begin()); - num_renumber_errors = thrust::count_if(handle.get_thrust_policy(), - pair_first, - pair_first + this_label_org_edgelist_dsts.size(), - [this_label_renumber_map] __device__(auto pair) { - auto org = thrust::get<0>(pair); - auto renumbered = thrust::get<1>(pair); - return this_label_renumber_map[renumbered] != org; - }); - ASSERT_TRUE(num_renumber_errors == 0) << "Renumber error in edge list destinations."; - - // Check the invariants in renumber_map - // Say we found the minimum (primary key:hop, secondary key:flag) pairs for every unique - // vertices, where flag is 0 for sources and 1 for destinations. Then, vertices with smaller - // (hop, flag) pairs should be renumbered to smaller numbers than vertices with larger (hop, - // flag) pairs. - - rmm::device_uvector unique_srcs(this_label_org_edgelist_srcs.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - this_label_org_edgelist_srcs.begin(), - this_label_org_edgelist_srcs.end(), - unique_srcs.begin()); - std::optional> unique_src_hops = - this_label_edgelist_hops ? std::make_optional>( - (*this_label_edgelist_hops).size(), handle.get_stream()) - : std::nullopt; - if (this_label_edgelist_hops) { - thrust::copy(handle.get_thrust_policy(), - (*this_label_edgelist_hops).begin(), - (*this_label_edgelist_hops).end(), - (*unique_src_hops).begin()); - - auto pair_first = - thrust::make_zip_iterator(unique_srcs.begin(), (*unique_src_hops).begin()); - thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_srcs.size()); - unique_srcs.resize( - thrust::distance(unique_srcs.begin(), - thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), - unique_srcs.begin(), - unique_srcs.end(), - (*unique_src_hops).begin()))), - handle.get_stream()); - (*unique_src_hops).resize(unique_srcs.size(), handle.get_stream()); - } else { - thrust::sort(handle.get_thrust_policy(), unique_srcs.begin(), unique_srcs.end()); - unique_srcs.resize( - thrust::distance( - unique_srcs.begin(), - thrust::unique(handle.get_thrust_policy(), unique_srcs.begin(), unique_srcs.end())), - handle.get_stream()); - } - - rmm::device_uvector unique_dsts(this_label_org_edgelist_dsts.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - this_label_org_edgelist_dsts.begin(), - this_label_org_edgelist_dsts.end(), - unique_dsts.begin()); - std::optional> unique_dst_hops = - this_label_edgelist_hops ? 
std::make_optional>( - (*this_label_edgelist_hops).size(), handle.get_stream()) - : std::nullopt; - if (this_label_edgelist_hops) { - thrust::copy(handle.get_thrust_policy(), - (*this_label_edgelist_hops).begin(), - (*this_label_edgelist_hops).end(), - (*unique_dst_hops).begin()); - - auto pair_first = - thrust::make_zip_iterator(unique_dsts.begin(), (*unique_dst_hops).begin()); - thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_dsts.size()); - unique_dsts.resize( - thrust::distance(unique_dsts.begin(), - thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), - unique_dsts.begin(), - unique_dsts.end(), - (*unique_dst_hops).begin()))), - handle.get_stream()); - (*unique_dst_hops).resize(unique_dsts.size(), handle.get_stream()); - } else { - thrust::sort(handle.get_thrust_policy(), unique_dsts.begin(), unique_dsts.end()); - unique_dsts.resize( - thrust::distance( - unique_dsts.begin(), - thrust::unique(handle.get_thrust_policy(), unique_dsts.begin(), unique_dsts.end())), - handle.get_stream()); - } - - rmm::device_uvector sorted_org_vertices(this_label_renumber_map.size(), - handle.get_stream()); - rmm::device_uvector matching_renumbered_vertices(sorted_org_vertices.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - this_label_renumber_map.begin(), - this_label_renumber_map.end(), - sorted_org_vertices.begin()); - thrust::sequence(handle.get_thrust_policy(), - matching_renumbered_vertices.begin(), - matching_renumbered_vertices.end(), - vertex_t{0}); - thrust::sort_by_key(handle.get_thrust_policy(), - sorted_org_vertices.begin(), - sorted_org_vertices.end(), - matching_renumbered_vertices.begin()); - - if (this_label_edgelist_hops) { - rmm::device_uvector merged_vertices(unique_srcs.size() + unique_dsts.size(), - handle.get_stream()); - rmm::device_uvector merged_hops(merged_vertices.size(), handle.get_stream()); - rmm::device_uvector merged_flags(merged_vertices.size(), handle.get_stream()); - - auto src_triplet_first = - thrust::make_zip_iterator(unique_srcs.begin(), - (*unique_src_hops).begin(), - thrust::make_constant_iterator(int8_t{0})); - auto dst_triplet_first = - thrust::make_zip_iterator(unique_dsts.begin(), - (*unique_dst_hops).begin(), - thrust::make_constant_iterator(int8_t{1})); - thrust::merge(handle.get_thrust_policy(), - src_triplet_first, - src_triplet_first + unique_srcs.size(), - dst_triplet_first, - dst_triplet_first + unique_dsts.size(), - thrust::make_zip_iterator( - merged_vertices.begin(), merged_hops.begin(), merged_flags.begin())); - merged_vertices.resize( - thrust::distance( - merged_vertices.begin(), - thrust::get<0>(thrust::unique_by_key( - handle.get_thrust_policy(), - merged_vertices.begin(), - merged_vertices.end(), - thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), - handle.get_stream()); - merged_hops.resize(merged_vertices.size(), handle.get_stream()); - merged_flags.resize(merged_vertices.size(), handle.get_stream()); - - auto sort_key_first = - thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); - thrust::sort_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - merged_vertices.begin()); - - auto num_unique_keys = thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(merged_hops.size()), - cugraph::detail::is_first_in_run_t{sort_key_first}); - rmm::device_uvector min_vertices(num_unique_keys, handle.get_stream()); - rmm::device_uvector 
max_vertices(num_unique_keys, handle.get_stream()); - - auto renumbered_merged_vertex_first = thrust::make_transform_iterator( - merged_vertices.begin(), - [sorted_org_vertices = raft::device_span(sorted_org_vertices.data(), - sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t src) { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), src); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), - it)]; - }); - - thrust::reduce_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - renumbered_merged_vertex_first, - thrust::make_discard_iterator(), - min_vertices.begin(), - thrust::equal_to>{}, - thrust::minimum{}); - thrust::reduce_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - renumbered_merged_vertex_first, - thrust::make_discard_iterator(), - max_vertices.begin(), - thrust::equal_to>{}, - thrust::maximum{}); - - auto num_violations = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{1}), - thrust::make_counting_iterator(min_vertices.size()), - [min_vertices = raft::device_span(min_vertices.data(), - min_vertices.size()), - max_vertices = raft::device_span( - max_vertices.data(), max_vertices.size())] __device__(size_t i) { - return min_vertices[i] <= max_vertices[i - 1]; - }); - - ASSERT_TRUE(num_violations == 0) - << "Invariant violated, a vertex with a smaller (hop,flag) pair is renumbered to a " - "larger value than a vertex with a larger (hop, flag) pair."; - } else { - unique_dsts.resize( - thrust::distance( - unique_dsts.begin(), - thrust::remove_if(handle.get_thrust_policy(), - unique_dsts.begin(), - unique_dsts.end(), - [sorted_unique_srcs = raft::device_span( - unique_srcs.data(), unique_srcs.size())] __device__(auto dst) { - return thrust::binary_search(thrust::seq, - sorted_unique_srcs.begin(), - sorted_unique_srcs.end(), - dst); - })), - handle.get_stream()); - - auto max_src_renumbered_vertex = thrust::transform_reduce( - handle.get_thrust_policy(), - unique_srcs.begin(), - unique_srcs.end(), - [sorted_org_vertices = raft::device_span(sorted_org_vertices.data(), - sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t src) { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), src); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), - it)]; - }, - std::numeric_limits::lowest(), - thrust::maximum{}); - - auto min_dst_renumbered_vertex = thrust::transform_reduce( - handle.get_thrust_policy(), - unique_dsts.begin(), - unique_dsts.end(), - [sorted_org_vertices = raft::device_span(sorted_org_vertices.data(), - sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t dst) { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), dst); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), - it)]; - }, - std::numeric_limits::max(), - thrust::minimum{}); - - ASSERT_TRUE(max_src_renumbered_vertex < min_dst_renumbered_vertex) - << "Invariants violated, a 
source vertex is renumbered to a non-smaller value than a " - "vertex that appear only in the edge list destinations."; - } - } - } - } -}; - -TEST_P(Tests_RenumberSampledEdgelist, CheckInt32) -{ - auto param = GetParam(); - run_current_test(param); -} - -TEST_P(Tests_RenumberSampledEdgelist, CheckInt64) -{ - auto param = GetParam(); - run_current_test(param); -} - -INSTANTIATE_TEST_SUITE_P( - small_test, - Tests_RenumberSampledEdgelist, - ::testing::Values(RenumberSampledEdgelist_Usecase{1024, 4096, 1, 1, true}, - RenumberSampledEdgelist_Usecase{1024, 4096, 3, 1, true}, - RenumberSampledEdgelist_Usecase{1024, 32768, 1, 256, true}, - RenumberSampledEdgelist_Usecase{1024, 32768, 3, 256, true})); - -INSTANTIATE_TEST_SUITE_P( - benchmark_test, - Tests_RenumberSampledEdgelist, - ::testing::Values(RenumberSampledEdgelist_Usecase{1 << 20, 1 << 20, 1, 1, false}, - RenumberSampledEdgelist_Usecase{1 << 20, 1 << 20, 5, 1, false}, - RenumberSampledEdgelist_Usecase{1 << 20, 1 << 24, 1, 1 << 20, false}, - RenumberSampledEdgelist_Usecase{1 << 20, 1 << 24, 5, 1 << 20, false})); - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu new file mode 100644 index 00000000000..422fe953b20 --- /dev/null +++ b/cpp/tests/sampling/sampling_post_processing_test.cu @@ -0,0 +1,1457 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct SamplingPostProcessing_Usecase { + size_t num_labels{}; + size_t num_seeds_per_label{}; + std::vector fanouts{{-1}}; + bool sample_with_replacement{false}; + + bool src_is_major{true}; + bool compress_per_hop{false}; + bool doubly_compress{false}; + bool check_correctness{true}; +}; + +template +bool compare_edgelist(raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumber_map) +{ + if (org_edgelist_srcs.size() != renumbered_edgelist_srcs.size()) { return false; } + + rmm::device_uvector sorted_org_edgelist_srcs(org_edgelist_srcs.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_srcs.begin(), + org_edgelist_srcs.end(), + sorted_org_edgelist_srcs.begin()); + rmm::device_uvector sorted_org_edgelist_dsts(org_edgelist_dsts.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_dsts.begin(), + org_edgelist_dsts.end(), + sorted_org_edgelist_dsts.begin()); + auto sorted_org_edgelist_weights = org_edgelist_weights + ? 
std::make_optional>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + if (sorted_org_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_weights).begin(), + (*org_edgelist_weights).end(), + (*sorted_org_edgelist_weights).begin()); + } + + if (sorted_org_edgelist_weights) { + auto sorted_org_edge_first = thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), + sorted_org_edgelist_dsts.begin(), + (*sorted_org_edgelist_weights).begin()); + thrust::sort(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + sorted_org_edgelist_srcs.size()); + } else { + auto sorted_org_edge_first = + thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), sorted_org_edgelist_dsts.begin()); + thrust::sort(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + sorted_org_edgelist_srcs.size()); + } + + rmm::device_uvector sorted_unrenumbered_edgelist_srcs(renumbered_edgelist_srcs.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumbered_edgelist_srcs.begin(), + renumbered_edgelist_srcs.end(), + sorted_unrenumbered_edgelist_srcs.begin()); + rmm::device_uvector sorted_unrenumbered_edgelist_dsts(renumbered_edgelist_dsts.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumbered_edgelist_dsts.begin(), + renumbered_edgelist_dsts.end(), + sorted_unrenumbered_edgelist_dsts.begin()); + auto sorted_unrenumbered_edgelist_weights = + renumbered_edgelist_weights ? std::make_optional>( + (*renumbered_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + if (sorted_unrenumbered_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*renumbered_edgelist_weights).begin(), + (*renumbered_edgelist_weights).end(), + (*sorted_unrenumbered_edgelist_weights).begin()); + } + + if (renumber_map) { + cugraph::unrenumber_int_vertices( + handle, + sorted_unrenumbered_edgelist_srcs.data(), + sorted_unrenumbered_edgelist_srcs.size(), + (*renumber_map).data(), + std::vector{static_cast((*renumber_map).size())}); + cugraph::unrenumber_int_vertices( + handle, + sorted_unrenumbered_edgelist_dsts.data(), + sorted_unrenumbered_edgelist_dsts.size(), + (*renumber_map).data(), + std::vector{static_cast((*renumber_map).size())}); + } + + if (sorted_unrenumbered_edgelist_weights) { + auto sorted_unrenumbered_edge_first = + thrust::make_zip_iterator(sorted_unrenumbered_edgelist_srcs.begin(), + sorted_unrenumbered_edgelist_dsts.begin(), + (*sorted_unrenumbered_edgelist_weights).begin()); + thrust::sort(handle.get_thrust_policy(), + sorted_unrenumbered_edge_first, + sorted_unrenumbered_edge_first + sorted_unrenumbered_edgelist_srcs.size()); + + auto sorted_org_edge_first = thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), + sorted_org_edgelist_dsts.begin(), + (*sorted_org_edgelist_weights).begin()); + return thrust::equal(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + sorted_org_edgelist_srcs.size(), + sorted_unrenumbered_edge_first); + } else { + auto sorted_unrenumbered_edge_first = thrust::make_zip_iterator( + sorted_unrenumbered_edgelist_srcs.begin(), sorted_unrenumbered_edgelist_dsts.begin()); + thrust::sort(handle.get_thrust_policy(), + sorted_unrenumbered_edge_first, + sorted_unrenumbered_edge_first + sorted_unrenumbered_edgelist_srcs.size()); + + auto sorted_org_edge_first = + thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), sorted_org_edgelist_dsts.begin()); + return 
thrust::equal(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + sorted_org_edgelist_srcs.size(), + sorted_unrenumbered_edge_first); + } +} + +template +bool check_renumber_map_invariants( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_hops, + raft::device_span renumber_map, + bool src_is_major) +{ + // Check the invariants in renumber_map + // Say we found the minimum (primary key:hop, secondary key:flag) pairs for every unique vertices, + // where flag is 0 for sources and 1 for destinations. Then, vertices with smaller (hop, flag) + // pairs should be renumbered to smaller numbers than vertices with larger (hop, flag) pairs. + auto org_edgelist_majors = src_is_major ? org_edgelist_srcs : org_edgelist_dsts; + auto org_edgelist_minors = src_is_major ? org_edgelist_dsts : org_edgelist_srcs; + + rmm::device_uvector unique_majors(org_edgelist_majors.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_majors.begin(), + org_edgelist_majors.end(), + unique_majors.begin()); + std::optional> unique_major_hops = + org_edgelist_hops ? std::make_optional>( + (*org_edgelist_hops).size(), handle.get_stream()) + : std::nullopt; + if (org_edgelist_hops) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_hops).begin(), + (*org_edgelist_hops).end(), + (*unique_major_hops).begin()); + + auto pair_first = + thrust::make_zip_iterator(unique_majors.begin(), (*unique_major_hops).begin()); + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_majors.size()); + unique_majors.resize( + thrust::distance(unique_majors.begin(), + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + unique_majors.begin(), + unique_majors.end(), + (*unique_major_hops).begin()))), + handle.get_stream()); + (*unique_major_hops).resize(unique_majors.size(), handle.get_stream()); + } else { + thrust::sort(handle.get_thrust_policy(), unique_majors.begin(), unique_majors.end()); + unique_majors.resize( + thrust::distance( + unique_majors.begin(), + thrust::unique(handle.get_thrust_policy(), unique_majors.begin(), unique_majors.end())), + handle.get_stream()); + } + + rmm::device_uvector unique_minors(org_edgelist_minors.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_minors.begin(), + org_edgelist_minors.end(), + unique_minors.begin()); + std::optional> unique_minor_hops = + org_edgelist_hops ? 
std::make_optional>( + (*org_edgelist_hops).size(), handle.get_stream()) + : std::nullopt; + if (org_edgelist_hops) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_hops).begin(), + (*org_edgelist_hops).end(), + (*unique_minor_hops).begin()); + + auto pair_first = + thrust::make_zip_iterator(unique_minors.begin(), (*unique_minor_hops).begin()); + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_minors.size()); + unique_minors.resize( + thrust::distance(unique_minors.begin(), + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + unique_minors.begin(), + unique_minors.end(), + (*unique_minor_hops).begin()))), + handle.get_stream()); + (*unique_minor_hops).resize(unique_minors.size(), handle.get_stream()); + } else { + thrust::sort(handle.get_thrust_policy(), unique_minors.begin(), unique_minors.end()); + unique_minors.resize( + thrust::distance( + unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), unique_minors.begin(), unique_minors.end())), + handle.get_stream()); + } + + rmm::device_uvector sorted_org_vertices(renumber_map.size(), handle.get_stream()); + rmm::device_uvector matching_renumbered_vertices(sorted_org_vertices.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumber_map.begin(), + renumber_map.end(), + sorted_org_vertices.begin()); + thrust::sequence(handle.get_thrust_policy(), + matching_renumbered_vertices.begin(), + matching_renumbered_vertices.end(), + vertex_t{0}); + thrust::sort_by_key(handle.get_thrust_policy(), + sorted_org_vertices.begin(), + sorted_org_vertices.end(), + matching_renumbered_vertices.begin()); + + if (org_edgelist_hops) { + rmm::device_uvector merged_vertices(unique_majors.size() + unique_minors.size(), + handle.get_stream()); + rmm::device_uvector merged_hops(merged_vertices.size(), handle.get_stream()); + rmm::device_uvector merged_flags(merged_vertices.size(), handle.get_stream()); + + auto major_triplet_first = thrust::make_zip_iterator(unique_majors.begin(), + (*unique_major_hops).begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_triplet_first = thrust::make_zip_iterator(unique_minors.begin(), + (*unique_minor_hops).begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge(handle.get_thrust_policy(), + major_triplet_first, + major_triplet_first + unique_majors.size(), + minor_triplet_first, + minor_triplet_first + unique_minors.size(), + thrust::make_zip_iterator( + merged_vertices.begin(), merged_hops.begin(), merged_flags.begin())); + merged_vertices.resize( + thrust::distance(merged_vertices.begin(), + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end(), + thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), + handle.get_stream()); + merged_hops.resize(merged_vertices.size(), handle.get_stream()); + merged_flags.resize(merged_vertices.size(), handle.get_stream()); + + auto sort_key_first = thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + merged_vertices.begin()); + + auto num_unique_keys = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(merged_hops.size()), + cugraph::detail::is_first_in_run_t{sort_key_first}); + rmm::device_uvector min_vertices(num_unique_keys, handle.get_stream()); + rmm::device_uvector 
max_vertices(num_unique_keys, handle.get_stream()); + + auto renumbered_merged_vertex_first = thrust::make_transform_iterator( + merged_vertices.begin(), + [sorted_org_vertices = + raft::device_span(sorted_org_vertices.data(), sorted_org_vertices.size()), + matching_renumbered_vertices = raft::device_span( + matching_renumbered_vertices.data(), + matching_renumbered_vertices.size())] __device__(vertex_t major) { + auto it = thrust::lower_bound( + thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major); + return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; + }); + + thrust::reduce_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + renumbered_merged_vertex_first, + thrust::make_discard_iterator(), + min_vertices.begin(), + thrust::equal_to>{}, + thrust::minimum{}); + thrust::reduce_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + renumbered_merged_vertex_first, + thrust::make_discard_iterator(), + max_vertices.begin(), + thrust::equal_to>{}, + thrust::maximum{}); + + auto num_violations = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{1}), + thrust::make_counting_iterator(min_vertices.size()), + [min_vertices = raft::device_span(min_vertices.data(), min_vertices.size()), + max_vertices = raft::device_span(max_vertices.data(), + max_vertices.size())] __device__(size_t i) { + return min_vertices[i] <= max_vertices[i - 1]; + }); + + return (num_violations == 0); + } else { + unique_minors.resize( + thrust::distance( + unique_minors.begin(), + thrust::remove_if(handle.get_thrust_policy(), + unique_minors.begin(), + unique_minors.end(), + [sorted_unique_majors = raft::device_span( + unique_majors.data(), unique_majors.size())] __device__(auto minor) { + return thrust::binary_search(thrust::seq, + sorted_unique_majors.begin(), + sorted_unique_majors.end(), + minor); + })), + handle.get_stream()); + + auto max_major_renumbered_vertex = thrust::transform_reduce( + handle.get_thrust_policy(), + unique_majors.begin(), + unique_majors.end(), + [sorted_org_vertices = + raft::device_span(sorted_org_vertices.data(), sorted_org_vertices.size()), + matching_renumbered_vertices = raft::device_span( + matching_renumbered_vertices.data(), + matching_renumbered_vertices.size())] __device__(vertex_t major) { + auto it = thrust::lower_bound( + thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major); + return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; + }, + std::numeric_limits::lowest(), + thrust::maximum{}); + + auto min_minor_renumbered_vertex = thrust::transform_reduce( + handle.get_thrust_policy(), + unique_minors.begin(), + unique_minors.end(), + [sorted_org_vertices = + raft::device_span(sorted_org_vertices.data(), sorted_org_vertices.size()), + matching_renumbered_vertices = raft::device_span( + matching_renumbered_vertices.data(), + matching_renumbered_vertices.size())] __device__(vertex_t minor) { + auto it = thrust::lower_bound( + thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), minor); + return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; + }, + std::numeric_limits::max(), + thrust::minimum{}); + + return (max_major_renumbered_vertex < min_minor_renumbered_vertex); + } +} + +template +class Tests_SamplingPostProcessing + : public ::testing::TestWithParam> { + public: + Tests_SamplingPostProcessing() {} + + 
static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test( + std::tuple const& param) + { + using label_t = int32_t; + using weight_t = float; + using edge_id_t = vertex_t; + using edge_type_t = int32_t; + + bool constexpr store_transposed = false; + bool constexpr renumber = true; + bool constexpr test_weighted = true; + + auto [sampling_post_processing_usecase, input_usecase] = param; + + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + auto [graph, edge_weights, d_renumber_map_labels] = + cugraph::test::construct_graph( + handle, input_usecase, test_weighted, renumber); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto graph_view = graph.view(); + auto edge_weight_view = + edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt; + + raft::random::RngState rng_state(0); + + rmm::device_uvector starting_vertices( + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.num_seeds_per_label, + handle.get_stream()); + cugraph::detail::uniform_random_fill(handle.get_stream(), + starting_vertices.data(), + starting_vertices.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + auto starting_vertex_labels = (sampling_post_processing_usecase.num_labels > 1) + ? std::make_optional>( + starting_vertices.size(), handle.get_stream()) + : std::nullopt; + if (starting_vertex_labels) { + thrust::tabulate( + handle.get_thrust_policy(), + (*starting_vertex_labels).begin(), + (*starting_vertex_labels).end(), + [num_seeds_per_label = sampling_post_processing_usecase.num_seeds_per_label] __device__( + size_t i) { return static_cast(i / num_seeds_per_label); }); + } + + rmm::device_uvector org_edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector org_edgelist_dsts(0, handle.get_stream()); + std::optional> org_edgelist_weights{std::nullopt}; + std::optional> org_edgelist_hops{std::nullopt}; + std::optional> org_labels{std::nullopt}; + std::optional> org_edgelist_label_offsets{std::nullopt}; + std::tie(org_edgelist_srcs, + org_edgelist_dsts, + org_edgelist_weights, + std::ignore, + std::ignore, + org_edgelist_hops, + org_labels, + org_edgelist_label_offsets) = cugraph::uniform_neighbor_sample( + handle, + graph_view, + edge_weight_view, + std::nullopt, + std::nullopt, + raft::device_span(starting_vertices.data(), starting_vertices.size()), + starting_vertex_labels ? std::make_optional>( + (*starting_vertex_labels).data(), (*starting_vertex_labels).size()) + : std::nullopt, + std::nullopt, + raft::host_span(sampling_post_processing_usecase.fanouts.data(), + sampling_post_processing_usecase.fanouts.size()), + rng_state, + sampling_post_processing_usecase.fanouts.size() > 1, + sampling_post_processing_usecase.sample_with_replacement, + (!sampling_post_processing_usecase.compress_per_hop && + (sampling_post_processing_usecase.fanouts.size() > 1)) + ? 
cugraph::prior_sources_behavior_t::EXCLUDE + : cugraph::prior_sources_behavior_t::DEFAULT, + false); + + if (!sampling_post_processing_usecase.src_is_major) { + std::swap(org_edgelist_srcs, org_edgelist_dsts); + } + + starting_vertices.resize(0, handle.get_stream()); + starting_vertices.shrink_to_fit(handle.get_stream()); + starting_vertex_labels = std::nullopt; + + { + rmm::device_uvector renumbered_and_sorted_edgelist_srcs(org_edgelist_srcs.size(), + handle.get_stream()); + rmm::device_uvector renumbered_and_sorted_edgelist_dsts(org_edgelist_dsts.size(), + handle.get_stream()); + auto renumbered_and_sorted_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + std::optional> renumbered_and_sorted_edgelist_edge_ids{ + std::nullopt}; + std::optional> renumbered_and_sorted_edgelist_edge_types{ + std::nullopt}; + auto renumbered_and_sorted_edgelist_hops = + org_edgelist_hops + ? std::make_optional(std::make_tuple( + rmm::device_uvector((*org_edgelist_hops).size(), handle.get_stream()), + sampling_post_processing_usecase.fanouts.size())) + : std::nullopt; + + raft::copy(renumbered_and_sorted_edgelist_srcs.data(), + org_edgelist_srcs.data(), + org_edgelist_srcs.size(), + handle.get_stream()); + raft::copy(renumbered_and_sorted_edgelist_dsts.data(), + org_edgelist_dsts.data(), + org_edgelist_dsts.size(), + handle.get_stream()); + if (renumbered_and_sorted_edgelist_weights) { + raft::copy((*renumbered_and_sorted_edgelist_weights).data(), + (*org_edgelist_weights).data(), + (*org_edgelist_weights).size(), + handle.get_stream()); + } + if (renumbered_and_sorted_edgelist_hops) { + raft::copy(std::get<0>(*renumbered_and_sorted_edgelist_hops).data(), + (*org_edgelist_hops).data(), + (*org_edgelist_hops).size(), + handle.get_stream()); + } + + std::optional> renumbered_and_sorted_edgelist_label_hop_offsets{ + std::nullopt}; + rmm::device_uvector renumbered_and_sorted_renumber_map(0, handle.get_stream()); + std::optional> renumbered_and_sorted_renumber_map_label_offsets{ + std::nullopt}; + + { + size_t free_size{}; + size_t total_size{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + std::cout << "free_size=" << free_size / (1024.0 * 1024.0 * 1024.0) + << "GB total_size=" << total_size / (1024.0 * 1024.0 * 1024.0) << "GB." + << std::endl; + } + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Renumber and sort sampled edgelist"); + } + + std::tie(renumbered_and_sorted_edgelist_srcs, + renumbered_and_sorted_edgelist_dsts, + renumbered_and_sorted_edgelist_weights, + renumbered_and_sorted_edgelist_edge_ids, + renumbered_and_sorted_edgelist_edge_types, + renumbered_and_sorted_edgelist_label_hop_offsets, + renumbered_and_sorted_renumber_map, + renumbered_and_sorted_renumber_map_label_offsets) = + cugraph::renumber_and_sort_sampled_edgelist( + handle, + std::move(renumbered_and_sorted_edgelist_srcs), + std::move(renumbered_and_sorted_edgelist_dsts), + std::move(renumbered_and_sorted_edgelist_weights), + std::move(renumbered_and_sorted_edgelist_edge_ids), + std::move(renumbered_and_sorted_edgelist_edge_types), + std::move(renumbered_and_sorted_edgelist_hops), + org_edgelist_label_offsets + ? 
std::make_optional(std::make_tuple( + raft::device_span((*org_edgelist_label_offsets).data(), + (*org_edgelist_label_offsets).size()), + sampling_post_processing_usecase.num_labels)) + : std::nullopt, + sampling_post_processing_usecase.src_is_major); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (sampling_post_processing_usecase.check_correctness) { + if (renumbered_and_sorted_edgelist_label_hop_offsets) { + ASSERT_TRUE((*renumbered_and_sorted_edgelist_label_hop_offsets).size() == + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size() + + 1) + << "Renumbered and sorted edge list (label,hop) offset array size should coincide with " + "the number of labels * the number of hops + 1."; + + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + (*renumbered_and_sorted_edgelist_label_hop_offsets).begin(), + (*renumbered_and_sorted_edgelist_label_hop_offsets).end())) + << "Renumbered and sorted edge list (label,hop) offset array values should be " + "non-decreasing."; + } + + if (renumbered_and_sorted_renumber_map_label_offsets) { + ASSERT_TRUE((*renumbered_and_sorted_renumber_map_label_offsets).size() == + sampling_post_processing_usecase.num_labels + 1) + << "Renumbered and sorted offset (label, hop) offset array size should coincide with " + "the number of labels + 1."; + + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + (*renumbered_and_sorted_renumber_map_label_offsets).begin(), + (*renumbered_and_sorted_renumber_map_label_offsets).end())) + << "Renumbered and sorted renumber map label offset array values should be " + "non-decreasing."; + + ASSERT_TRUE( + (*renumbered_and_sorted_renumber_map_label_offsets).back_element(handle.get_stream()) == + renumbered_and_sorted_renumber_map.size()) + << "Renumbered and sorted renumber map label offset array's last value should coincide " + "with the renumber map size."; + } + + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { + size_t edgelist_start_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t edgelist_end_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i + 1, handle.get_stream()) + : org_edgelist_srcs.size(); + if (edgelist_start_offset == edgelist_end_offset) continue; + + auto this_label_org_edgelist_srcs = + raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_dsts = + raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_hops = + org_edgelist_hops ? std::make_optional>( + (*org_edgelist_hops).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + auto this_label_org_edgelist_weights = + org_edgelist_weights ? 
std::make_optional>( + (*org_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + auto this_label_output_edgelist_srcs = raft::device_span( + renumbered_and_sorted_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_output_edgelist_dsts = raft::device_span( + renumbered_and_sorted_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_output_edgelist_weights = + renumbered_and_sorted_edgelist_weights + ? std::make_optional>( + (*renumbered_and_sorted_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + size_t renumber_map_start_offset = + renumbered_and_sorted_renumber_map_label_offsets + ? (*renumbered_and_sorted_renumber_map_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t renumber_map_end_offset = renumbered_and_sorted_renumber_map_label_offsets + ? (*renumbered_and_sorted_renumber_map_label_offsets) + .element(i + 1, handle.get_stream()) + : renumbered_and_sorted_renumber_map.size(); + auto this_label_output_renumber_map = raft::device_span( + renumbered_and_sorted_renumber_map.data() + renumber_map_start_offset, + renumber_map_end_offset - renumber_map_start_offset); + + // check whether the edges are properly sorted + + auto this_label_output_edgelist_majors = sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_srcs + : this_label_output_edgelist_dsts; + auto this_label_output_edgelist_minors = sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_dsts + : this_label_output_edgelist_srcs; + + if (this_label_org_edgelist_hops) { + auto num_hops = sampling_post_processing_usecase.fanouts.size(); + auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), + this_label_output_edgelist_minors.begin()); + for (size_t j = 0; j < num_hops; ++j) { + auto hop_start_offset = (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i * num_hops + j, handle.get_stream()) - + (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i * num_hops, handle.get_stream()); + auto hop_end_offset = (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i * num_hops + j + 1, handle.get_stream()) - + (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i * num_hops, handle.get_stream()); + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + edge_first + hop_start_offset, + edge_first + hop_end_offset)) + << "Renumbered and sorted output edges are not properly sorted."; + } + } else { + auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), + this_label_output_edgelist_minors.begin()); + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + edge_first, + edge_first + this_label_output_edgelist_majors.size())) + << "Renumbered and sorted output edges are not properly sorted."; + } + + // check whether renumbering recovers the original edge list + + ASSERT_TRUE(compare_edgelist(handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_weights, + this_label_output_edgelist_srcs, + this_label_output_edgelist_dsts, + this_label_output_edgelist_weights, + std::make_optional(this_label_output_renumber_map))) + << "Unrenumbering the renumbered and sorted edge list does not recover the original " + "edgelist."; + + // Check the invariants in renumber_map + + 
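check_renumber_map_invariants, invoked just below, verifies the ordering property stated earlier in this file: if a vertex's first appearance has a smaller (hop, src/dst flag) key than another vertex's, it must receive a smaller renumbered ID. A small host-side sketch of the same adjacent-group check on toy data (illustrative only; this is not the test's device-side implementation):

#include <algorithm>
#include <cassert>
#include <map>
#include <utility>
#include <vector>

int main()
{
  // first_seen_key[v] = (hop, flag) of renumbered vertex v's first appearance
  std::vector<std::pair<int, int>> first_seen_key = {{0, 0}, {0, 0}, {0, 1}, {1, 0}, {1, 1}};

  // per-key minimum and maximum renumbered ID
  std::map<std::pair<int, int>, std::pair<int, int>> min_max;
  for (int v = 0; v < static_cast<int>(first_seen_key.size()); ++v) {
    auto [it, inserted] = min_max.try_emplace(first_seen_key[v], std::make_pair(v, v));
    if (!inserted) {
      it->second.first  = std::min(it->second.first, v);
      it->second.second = std::max(it->second.second, v);
    }
  }

  // invariant: each group's smallest ID exceeds the previous group's largest ID, which
  // (transitively) means every ID in a smaller key group is smaller than every ID in a larger one
  int prev_max = -1;
  for (auto const& [key, mm] : min_max) {
    assert(mm.first > prev_max);
    prev_max = mm.second;
  }
  return 0;
}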
ASSERT_TRUE(check_renumber_map_invariants(handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_hops, + this_label_output_renumber_map, + sampling_post_processing_usecase.src_is_major)) + << "Renumbered and sorted output renumber map violates invariants."; + } + } + } + + { + rmm::device_uvector renumbered_and_compressed_edgelist_srcs( + org_edgelist_srcs.size(), handle.get_stream()); + rmm::device_uvector renumbered_and_compressed_edgelist_dsts( + org_edgelist_dsts.size(), handle.get_stream()); + auto renumbered_and_compressed_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + std::optional> renumbered_and_compressed_edgelist_edge_ids{ + std::nullopt}; + std::optional> renumbered_and_compressed_edgelist_edge_types{ + std::nullopt}; + auto renumbered_and_compressed_edgelist_hops = + org_edgelist_hops + ? std::make_optional(std::make_tuple( + rmm::device_uvector((*org_edgelist_hops).size(), handle.get_stream()), + sampling_post_processing_usecase.fanouts.size())) + : std::nullopt; + + raft::copy(renumbered_and_compressed_edgelist_srcs.data(), + org_edgelist_srcs.data(), + org_edgelist_srcs.size(), + handle.get_stream()); + raft::copy(renumbered_and_compressed_edgelist_dsts.data(), + org_edgelist_dsts.data(), + org_edgelist_dsts.size(), + handle.get_stream()); + if (renumbered_and_compressed_edgelist_weights) { + raft::copy((*renumbered_and_compressed_edgelist_weights).data(), + (*org_edgelist_weights).data(), + (*org_edgelist_weights).size(), + handle.get_stream()); + } + if (renumbered_and_compressed_edgelist_hops) { + raft::copy(std::get<0>(*renumbered_and_compressed_edgelist_hops).data(), + (*org_edgelist_hops).data(), + (*org_edgelist_hops).size(), + handle.get_stream()); + } + + std::optional> renumbered_and_compressed_nzd_vertices{ + std::nullopt}; + rmm::device_uvector renumbered_and_compressed_offsets(0, handle.get_stream()); + rmm::device_uvector renumbered_and_compressed_edgelist_minors(0, + handle.get_stream()); + std::optional> renumbered_and_compressed_offset_label_hop_offsets{ + std::nullopt}; + rmm::device_uvector renumbered_and_compressed_renumber_map(0, handle.get_stream()); + std::optional> + renumbered_and_compressed_renumber_map_label_offsets{std::nullopt}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Renumber and compressed sampled edgelist"); + } + + std::tie(renumbered_and_compressed_nzd_vertices, + renumbered_and_compressed_offsets, + renumbered_and_compressed_edgelist_minors, + renumbered_and_compressed_edgelist_weights, + renumbered_and_compressed_edgelist_edge_ids, + renumbered_and_compressed_edgelist_edge_types, + renumbered_and_compressed_offset_label_hop_offsets, + renumbered_and_compressed_renumber_map, + renumbered_and_compressed_renumber_map_label_offsets) = + cugraph::renumber_and_compress_sampled_edgelist( + handle, + std::move(renumbered_and_compressed_edgelist_srcs), + std::move(renumbered_and_compressed_edgelist_dsts), + std::move(renumbered_and_compressed_edgelist_weights), + std::move(renumbered_and_compressed_edgelist_edge_ids), + std::move(renumbered_and_compressed_edgelist_edge_types), + std::move(renumbered_and_compressed_edgelist_hops), + org_edgelist_label_offsets + ? 
std::make_optional(std::make_tuple( + raft::device_span((*org_edgelist_label_offsets).data(), + (*org_edgelist_label_offsets).size()), + sampling_post_processing_usecase.num_labels)) + : std::nullopt, + sampling_post_processing_usecase.src_is_major, + sampling_post_processing_usecase.compress_per_hop, + sampling_post_processing_usecase.doubly_compress); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (sampling_post_processing_usecase.check_correctness) { + if (renumbered_and_compressed_nzd_vertices) { + ASSERT_TRUE(renumbered_and_compressed_offsets.size() == + (*renumbered_and_compressed_nzd_vertices).size() + 1) + << "Renumbered and compressed offset array size should coincide with the number of " + "non-zero-degree vertices + 1."; + } + + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + renumbered_and_compressed_offsets.begin(), + renumbered_and_compressed_offsets.end())) + << "Renumbered and compressed offset array values should be non-decreasing."; + + ASSERT_TRUE(renumbered_and_compressed_offsets.back_element(handle.get_stream()) == + renumbered_and_compressed_edgelist_minors.size()) + << "Renumbered and compressed offset array's last value should coincide with the number " + "of " + "edges."; + + if (renumbered_and_compressed_offset_label_hop_offsets) { + ASSERT_TRUE((*renumbered_and_compressed_offset_label_hop_offsets).size() == + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size() + + 1) + << "Renumbered and compressed offset (label,hop) offset array size should coincide " + "with " + "the number of labels * the number of hops + 1."; + + ASSERT_TRUE( + thrust::is_sorted(handle.get_thrust_policy(), + (*renumbered_and_compressed_offset_label_hop_offsets).begin(), + (*renumbered_and_compressed_offset_label_hop_offsets).end())) + << "Renumbered and compressed offset (label,hop) offset array values should be " + "non-decreasing."; + + ASSERT_TRUE((*renumbered_and_compressed_offset_label_hop_offsets) + .back_element(handle.get_stream()) == + renumbered_and_compressed_offsets.size() - 1) + << "Renumbered and compressed offset (label,hop) offset array's last value should " + "coincide with the offset array size - 1."; + } + + if (renumbered_and_compressed_renumber_map_label_offsets) { + ASSERT_TRUE((*renumbered_and_compressed_renumber_map_label_offsets).size() == + sampling_post_processing_usecase.num_labels + 1) + << "Renumbered and compressed renumber map label offset array size should coincide " + "with " + "the number of labels + 1."; + + ASSERT_TRUE( + thrust::is_sorted(handle.get_thrust_policy(), + (*renumbered_and_compressed_renumber_map_label_offsets).begin(), + (*renumbered_and_compressed_renumber_map_label_offsets).end())) + << "Renumbered and compressed renumber map label offset array values should be " + "non-decreasing."; + + ASSERT_TRUE((*renumbered_and_compressed_renumber_map_label_offsets) + .back_element(handle.get_stream()) == + renumbered_and_compressed_renumber_map.size()) + << "Renumbered and compressed renumber map label offset array's last value should " + "coincide with the renumber map size."; + } + + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { + size_t edgelist_start_offset = + org_edgelist_label_offsets + ?
(*org_edgelist_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t edgelist_end_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i + 1, handle.get_stream()) + : org_edgelist_srcs.size(); + if (edgelist_start_offset == edgelist_end_offset) continue; + + auto this_label_org_edgelist_srcs = + raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_dsts = + raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_hops = + org_edgelist_hops ? std::make_optional>( + (*org_edgelist_hops).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + auto this_label_org_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + rmm::device_uvector this_label_output_edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector this_label_output_edgelist_dsts(0, handle.get_stream()); + auto this_label_output_edgelist_weights = + renumbered_and_compressed_edgelist_weights + ? std::make_optional>(0, handle.get_stream()) + : std::nullopt; + this_label_output_edgelist_srcs.reserve(edgelist_end_offset - edgelist_start_offset, + handle.get_stream()); + this_label_output_edgelist_dsts.reserve(edgelist_end_offset - edgelist_start_offset, + handle.get_stream()); + if (this_label_output_edgelist_weights) { + (*this_label_output_edgelist_weights) + .reserve(edgelist_end_offset - edgelist_start_offset, handle.get_stream()); + } + + // decompress + + auto num_hops = sampling_post_processing_usecase.fanouts.size(); + for (size_t j = 0; j < num_hops; ++j) { + auto offset_start_offset = renumbered_and_compressed_offset_label_hop_offsets + ? (*renumbered_and_compressed_offset_label_hop_offsets) + .element(i * num_hops + j, handle.get_stream()) + : size_t{0}; + auto offset_end_offset = renumbered_and_compressed_offset_label_hop_offsets + ? ((*renumbered_and_compressed_offset_label_hop_offsets) + .element(i * num_hops + j + 1, handle.get_stream()) + + 1) + : renumbered_and_compressed_offsets.size(); + + auto base_v = + (!sampling_post_processing_usecase.doubly_compress && + !sampling_post_processing_usecase.compress_per_hop && (j > 0)) + ? static_cast(offset_start_offset - + (*renumbered_and_compressed_offset_label_hop_offsets) + .element(i * num_hops, handle.get_stream())) + : vertex_t{0}; + + raft::device_span d_offsets( + renumbered_and_compressed_offsets.data() + offset_start_offset, + offset_end_offset - offset_start_offset); + std::vector h_offsets(d_offsets.size()); + raft::update_host( + h_offsets.data(), d_offsets.data(), h_offsets.size(), handle.get_stream()); + handle.sync_stream(); + + auto old_size = this_label_output_edgelist_srcs.size(); + this_label_output_edgelist_srcs.resize(old_size + (h_offsets.back() - h_offsets[0]), + handle.get_stream()); + this_label_output_edgelist_dsts.resize(this_label_output_edgelist_srcs.size(), + handle.get_stream()); + if (this_label_output_edgelist_weights) { + (*this_label_output_edgelist_weights) + .resize(this_label_output_edgelist_srcs.size(), handle.get_stream()); + } + thrust::transform( + handle.get_thrust_policy(), + thrust::make_counting_iterator(h_offsets[0]), + thrust::make_counting_iterator(h_offsets.back()), + (sampling_post_processing_usecase.src_is_major + ? 
this_label_output_edgelist_srcs.begin() + : this_label_output_edgelist_dsts.begin()) + + old_size, + [offsets = raft::device_span(d_offsets.data(), d_offsets.size()), + nzd_vertices = + renumbered_and_compressed_nzd_vertices + ? thrust::make_optional>( + (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset, + (offset_end_offset - offset_start_offset) - 1) + : thrust::nullopt, + base_v] __device__(size_t i) { + auto idx = static_cast(thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); + if (nzd_vertices) { + return (*nzd_vertices)[idx]; + } else { + return base_v + static_cast(idx); + } + }); + thrust::copy(handle.get_thrust_policy(), + renumbered_and_compressed_edgelist_minors.begin() + h_offsets[0], + renumbered_and_compressed_edgelist_minors.begin() + h_offsets.back(), + (sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_dsts.begin() + : this_label_output_edgelist_srcs.begin()) + + old_size); + if (this_label_output_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*renumbered_and_compressed_edgelist_weights).begin() + h_offsets[0], + (*renumbered_and_compressed_edgelist_weights).begin() + h_offsets.back(), + (*this_label_output_edgelist_weights).begin() + old_size); + } + } + + size_t renumber_map_start_offset = + renumbered_and_compressed_renumber_map_label_offsets + ? (*renumbered_and_compressed_renumber_map_label_offsets) + .element(i, handle.get_stream()) + : size_t{0}; + size_t renumber_map_end_offset = + renumbered_and_compressed_renumber_map_label_offsets + ? (*renumbered_and_compressed_renumber_map_label_offsets) + .element(i + 1, handle.get_stream()) + : renumbered_and_compressed_renumber_map.size(); + auto this_label_output_renumber_map = raft::device_span( + renumbered_and_compressed_renumber_map.data() + renumber_map_start_offset, + renumber_map_end_offset - renumber_map_start_offset); + + // check whether renumbering recovers the original edge list + + ASSERT_TRUE(compare_edgelist( + handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_weights, + raft::device_span(this_label_output_edgelist_srcs.data(), + this_label_output_edgelist_srcs.size()), + raft::device_span(this_label_output_edgelist_dsts.data(), + this_label_output_edgelist_dsts.size()), + this_label_output_edgelist_weights + ? std::make_optional>( + (*this_label_output_edgelist_weights).data(), + (*this_label_output_edgelist_weights).size()) + : std::nullopt, + std::make_optional(this_label_output_renumber_map))) + << "Unrenumbering the renumbered and compressed edge list does not recover the original " + "edgelist."; + + // Check the invariants in renumber_map + + ASSERT_TRUE(check_renumber_map_invariants(handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_hops, + this_label_output_renumber_map, + sampling_post_processing_usecase.src_is_major)) + << "Renumbered and compressed output renumber map violates invariants."; + } + } + } + + { + rmm::device_uvector sorted_edgelist_srcs(org_edgelist_srcs.size(), + handle.get_stream()); + rmm::device_uvector sorted_edgelist_dsts(org_edgelist_dsts.size(), + handle.get_stream()); + auto sorted_edgelist_weights = org_edgelist_weights + ?
std::make_optional>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + std::optional> sorted_edgelist_edge_ids{std::nullopt}; + std::optional> sorted_edgelist_edge_types{std::nullopt}; + auto sorted_edgelist_hops = + org_edgelist_hops + ? std::make_optional(std::make_tuple( + rmm::device_uvector((*org_edgelist_hops).size(), handle.get_stream()), + sampling_post_processing_usecase.fanouts.size())) + : std::nullopt; + + raft::copy(sorted_edgelist_srcs.data(), + org_edgelist_srcs.data(), + org_edgelist_srcs.size(), + handle.get_stream()); + raft::copy(sorted_edgelist_dsts.data(), + org_edgelist_dsts.data(), + org_edgelist_dsts.size(), + handle.get_stream()); + if (sorted_edgelist_weights) { + raft::copy((*sorted_edgelist_weights).data(), + (*org_edgelist_weights).data(), + (*org_edgelist_weights).size(), + handle.get_stream()); + } + if (sorted_edgelist_hops) { + raft::copy(std::get<0>(*sorted_edgelist_hops).data(), + (*org_edgelist_hops).data(), + (*org_edgelist_hops).size(), + handle.get_stream()); + } + + std::optional> sorted_edgelist_label_hop_offsets{std::nullopt}; + + { + size_t free_size{}; + size_t total_size{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + std::cout << "free_size=" << free_size / (1024.0 * 1024.0 * 1024.0) + << "GB total_size=" << total_size / (1024.0 * 1024.0 * 1024.0) << "GB." + << std::endl; + } + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Sort sampled edgelist"); + } + + std::tie(sorted_edgelist_srcs, + sorted_edgelist_dsts, + sorted_edgelist_weights, + sorted_edgelist_edge_ids, + sorted_edgelist_edge_types, + sorted_edgelist_label_hop_offsets) = + cugraph::sort_sampled_edgelist( + handle, + std::move(sorted_edgelist_srcs), + std::move(sorted_edgelist_dsts), + std::move(sorted_edgelist_weights), + std::move(sorted_edgelist_edge_ids), + std::move(sorted_edgelist_edge_types), + std::move(sorted_edgelist_hops), + org_edgelist_label_offsets + ? std::make_optional(std::make_tuple( + raft::device_span((*org_edgelist_label_offsets).data(), + (*org_edgelist_label_offsets).size()), + sampling_post_processing_usecase.num_labels)) + : std::nullopt, + sampling_post_processing_usecase.src_is_major); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (sampling_post_processing_usecase.check_correctness) { + if (sorted_edgelist_label_hop_offsets) { + ASSERT_TRUE((*sorted_edgelist_label_hop_offsets).size() == + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size() + + 1) + << "Sorted edge list (label,hop) offset array size should coincide with " + "the number of labels * the number of hops + 1."; + + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + (*sorted_edgelist_label_hop_offsets).begin(), + (*sorted_edgelist_label_hop_offsets).end())) + << "Sorted edge list (label,hop) offset array values should be " + "non-decreasing."; + } + + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { + size_t edgelist_start_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t edgelist_end_offset = + org_edgelist_label_offsets + ? 
(*org_edgelist_label_offsets).element(i + 1, handle.get_stream()) + : org_edgelist_srcs.size(); + if (edgelist_start_offset == edgelist_end_offset) continue; + + auto this_label_org_edgelist_srcs = + raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_dsts = + raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_hops = + org_edgelist_hops ? std::make_optional>( + (*org_edgelist_hops).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + auto this_label_org_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + auto this_label_output_edgelist_srcs = + raft::device_span(sorted_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_output_edgelist_dsts = + raft::device_span(sorted_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_output_edgelist_weights = + sorted_edgelist_weights ? std::make_optional>( + (*sorted_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + // check whether the edges are properly sorted + + auto this_label_output_edgelist_majors = sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_srcs + : this_label_output_edgelist_dsts; + auto this_label_output_edgelist_minors = sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_dsts + : this_label_output_edgelist_srcs; + + if (this_label_org_edgelist_hops) { + auto num_hops = sampling_post_processing_usecase.fanouts.size(); + auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), + this_label_output_edgelist_minors.begin()); + for (size_t j = 0; j < num_hops; ++j) { + auto hop_start_offset = + (*sorted_edgelist_label_hop_offsets) + .element(i * num_hops + j, handle.get_stream()) - + (*sorted_edgelist_label_hop_offsets).element(i * num_hops, handle.get_stream()); + auto hop_end_offset = + (*sorted_edgelist_label_hop_offsets) + .element(i * num_hops + j + 1, handle.get_stream()) - + (*sorted_edgelist_label_hop_offsets).element(i * num_hops, handle.get_stream()); + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + edge_first + hop_start_offset, + edge_first + hop_end_offset)) + << "Sorted output edges are not properly sorted."; + } + } else { + auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), + this_label_output_edgelist_minors.begin()); + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + edge_first, + edge_first + this_label_output_edgelist_majors.size())) + << "Sorted output edges are not properly sorted."; + } + + // check whether sorting preserves the original edge list + + ASSERT_TRUE( + compare_edgelist(handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_weights, + this_label_output_edgelist_srcs, + this_label_output_edgelist_dsts, + this_label_output_edgelist_weights, + std::optional>{std::nullopt})) + << "Sorted edge list does not coincide with the original edgelist."; + } + } + } + } +}; + +using Tests_SamplingPostProcessing_File = Tests_SamplingPostProcessing;
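For readers tracing the compressed-output checks above: renumber_and_compress_sampled_edgelist returns an offsets array plus a minors array (and, when doubly compressed, a list of non-zero-degree majors), and the test expands that representation back into (major, minor) pairs on the device with thrust::upper_bound before comparing against the original samples. Below is a minimal host-side sketch of the same expansion in plain Python/NumPy; it is illustrative only, not part of this patch or the cuGraph API, and the function name and toy arrays are invented.

import numpy as np

def decompress_to_edgelist(offsets, minors, nzd_vertices=None, base_v=0):
    # offsets[k]..offsets[k+1] delimit the minors attached to the k-th major row.
    # With doubly-compressed output, row k corresponds to nzd_vertices[k];
    # otherwise it corresponds to the renumbered vertex base_v + k.
    counts = np.diff(offsets)                           # per-row degree
    row_ids = np.repeat(np.arange(len(counts)), counts)
    majors = nzd_vertices[row_ids] if nzd_vertices is not None else base_v + row_ids
    return majors, np.asarray(minors)

# toy example: three rows with degrees 2, 0, 1
offsets = np.array([0, 2, 2, 3])
minors = np.array([5, 7, 9])
majors, minors = decompress_to_edgelist(offsets, minors)
print(list(zip(majors.tolist(), minors.tolist())))      # [(0, 5), (0, 7), (2, 9)]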
+using Tests_SamplingPostProcessing_Rmat = Tests_SamplingPostProcessing; + +TEST_P(Tests_SamplingPostProcessing_File, CheckInt32Int32) +{ + run_current_test(override_File_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SamplingPostProcessing_Rmat, CheckInt32Int32) +{ + run_current_test(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SamplingPostProcessing_Rmat, CheckInt32Int64) +{ + run_current_test(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SamplingPostProcessing_Rmat, CheckInt64Int64) +{ + run_current_test(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_SamplingPostProcessing_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values( + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 4, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, false, 
false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, true, false, true}), + ::testing::Values(cugraph::test::File_Usecase("karate.mtx"), + cugraph::test::File_Usecase("dolphins.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_SamplingPostProcessing_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values( + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, false, true, 
false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, true, true, false, true}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, + Tests_SamplingPostProcessing_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values( + SamplingPostProcessing_Usecase{1, 64, {10}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, 
false, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, true, false, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/traversal/mg_sssp_test.cpp b/cpp/tests/traversal/mg_sssp_test.cpp index b3e96981f96..ea0353c3743 100644 --- a/cpp/tests/traversal/mg_sssp_test.cpp +++ b/cpp/tests/traversal/mg_sssp_test.cpp @@ -214,7 +214,7 @@ class Tests_MGSSSP : public ::testing::TestWithParam> sg_renumber_map, // std::nullopt if the SG graph is not renumbered std::optional> - mg_vertices, // std::nullopt if the entire local vertex partition range is assumed + mg_vertices, // std::nullopt if the entire local vertex partition range is assumed raft::device_span mg_values); template diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py index e5acbf34478..3e7f2f076f0 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py @@ -13,6 +13,7 @@ from .base import SparseGraph from .gatconv import GATConv +from .gatv2conv import GATv2Conv from .relgraphconv import RelGraphConv from .sageconv import SAGEConv from .transformerconv import TransformerConv @@ -20,6 +21,7 @@ __all__ = [ "SparseGraph", "GATConv", + "GATv2Conv", "RelGraphConv", "SAGEConv", "TransformerConv", diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py index 0eeaed29d86..307eb33078e 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py @@ -17,38 +17,7 @@ torch = import_optional("torch") ops_torch = import_optional("pylibcugraphops.pytorch") - - -class BaseConv(torch.nn.Module): - r"""An abstract base class for cugraph-ops nn module.""" - - def __init__(self): - super().__init__() - self._cached_offsets_fg = None - - def reset_parameters(self): - r"""Resets all learnable parameters of the module.""" - raise NotImplementedError - - def forward(self, *args): - r"""Runs the forward pass of the module.""" - raise NotImplementedError - - def pad_offsets(self, offsets: torch.Tensor, size: int) -> torch.Tensor: - r"""Pad zero-in-degree nodes to the end of offsets to reach size. 
This - is used to augment offset tensors from DGL blocks (MFGs) to be - compatible with cugraph-ops full-graph primitives.""" - if self._cached_offsets_fg is None: - self._cached_offsets_fg = torch.empty( - size, dtype=offsets.dtype, device=offsets.device - ) - elif self._cached_offsets_fg.numel() < size: - self._cached_offsets_fg.resize_(size) - - self._cached_offsets_fg[: offsets.numel()] = offsets - self._cached_offsets_fg[offsets.numel() : size] = offsets[-1] - - return self._cached_offsets_fg[:size] +dgl = import_optional("dgl") def compress_ids(ids: torch.Tensor, size: int) -> torch.Tensor: @@ -63,8 +32,9 @@ def decompress_ids(c_ids: torch.Tensor) -> torch.Tensor: class SparseGraph(object): - r"""A god-class to store different sparse formats needed by cugraph-ops - and facilitate sparse format conversions. + r"""A class to create and store different sparse formats needed by + cugraph-ops. It always creates a CSC representation and can provide COO- or + CSR-format if needed. Parameters ---------- @@ -89,25 +59,43 @@ class SparseGraph(object): consists of the sources between `src_indices[cdst_indices[k]]` and `src_indices[cdst_indices[k+1]]`. - dst_ids_is_sorted: bool - Whether `dst_ids` has been sorted in an ascending order. When sorted, - creating CSC layout is much faster. + values: torch.Tensor, optional + Values on the edges. + + is_sorted: bool + Whether the COO inputs (src_ids, dst_ids, values) have been sorted by + `dst_ids` in an ascending order. CSC layout creation is much faster + when sorted. formats: str or tuple of str, optional - The desired sparse formats to create for the graph. + The desired sparse formats to create for the graph. The formats tuple + must include "csc". Default: "csc". reduce_memory: bool, optional When set, the tensors are not required by the desired formats will be - set to `None`. + set to `None`. Default: True. Notes ----- For MFGs (sampled graphs), the node ids must have been renumbered. """ - supported_formats = {"coo": ("src_ids", "dst_ids"), "csc": ("cdst_ids", "src_ids")} - - all_tensors = set(["src_ids", "dst_ids", "csrc_ids", "cdst_ids"]) + supported_formats = { + "coo": ("_src_ids", "_dst_ids"), + "csc": ("_cdst_ids", "_src_ids"), + "csr": ("_csrc_ids", "_dst_ids", "_perm_csc2csr"), + } + + all_tensors = set( + [ + "_src_ids", + "_dst_ids", + "_csrc_ids", + "_cdst_ids", + "_perm_coo2csc", + "_perm_csc2csr", + ] + ) def __init__( self, @@ -116,15 +104,19 @@ def __init__( dst_ids: Optional[torch.Tensor] = None, csrc_ids: Optional[torch.Tensor] = None, cdst_ids: Optional[torch.Tensor] = None, - dst_ids_is_sorted: bool = False, - formats: Optional[Union[str, Tuple[str]]] = None, + values: Optional[torch.Tensor] = None, + is_sorted: bool = False, + formats: Union[str, Tuple[str]] = "csc", reduce_memory: bool = True, ): self._num_src_nodes, self._num_dst_nodes = size - self._dst_ids_is_sorted = dst_ids_is_sorted + self._is_sorted = is_sorted if dst_ids is None and cdst_ids is None: - raise ValueError("One of 'dst_ids' and 'cdst_ids' must be given.") + raise ValueError( + "One of 'dst_ids' and 'cdst_ids' must be given " + "to create a SparseGraph." 
+ ) if src_ids is not None: src_ids = src_ids.contiguous() @@ -148,21 +140,40 @@ def __init__( ) cdst_ids = cdst_ids.contiguous() + if values is not None: + values = values.contiguous() + self._src_ids = src_ids self._dst_ids = dst_ids self._csrc_ids = csrc_ids self._cdst_ids = cdst_ids - self._perm = None + self._values = values + self._perm_coo2csc = None + self._perm_csc2csr = None if isinstance(formats, str): formats = (formats,) - - if formats is not None: - for format_ in formats: - assert format_ in SparseGraph.supported_formats - self.__getattribute__(f"_create_{format_}")() self._formats = formats + if "csc" not in formats: + raise ValueError( + f"{self.__class__.__name__}.formats must contain " + f"'csc', but got {formats}." + ) + + # always create csc first + if self._cdst_ids is None: + if not self._is_sorted: + self._dst_ids, self._perm_coo2csc = torch.sort(self._dst_ids) + self._src_ids = self._src_ids[self._perm_coo2csc] + if self._values is not None: + self._values = self._values[self._perm_coo2csc] + self._cdst_ids = compress_ids(self._dst_ids, self._num_dst_nodes) + + for format_ in formats: + assert format_ in SparseGraph.supported_formats + self.__getattribute__(f"{format_}")() + self._reduce_memory = reduce_memory if reduce_memory: self.reduce_memory() @@ -170,8 +181,6 @@ def __init__( def reduce_memory(self): """Remove the tensors that are not necessary to create the desired sparse formats to reduce memory footprint.""" - - self._perm = None if self._formats is None: return @@ -181,16 +190,22 @@ def reduce_memory(self): for t in SparseGraph.all_tensors.difference(set(tensors_needed)): self.__dict__[t] = None - def _create_coo(self): + def src_ids(self) -> torch.Tensor: + return self._src_ids + + def cdst_ids(self) -> torch.Tensor: + return self._cdst_ids + + def dst_ids(self) -> torch.Tensor: if self._dst_ids is None: self._dst_ids = decompress_ids(self._cdst_ids) + return self._dst_ids - def _create_csc(self): - if self._cdst_ids is None: - if not self._dst_ids_is_sorted: - self._dst_ids, self._perm = torch.sort(self._dst_ids) - self._src_ids = self._src_ids[self._perm] - self._cdst_ids = compress_ids(self._dst_ids, self._num_dst_nodes) + def csrc_ids(self) -> torch.Tensor: + if self._csrc_ids is None: + src_ids, self._perm_csc2csr = torch.sort(self._src_ids) + self._csrc_ids = compress_ids(src_ids, self._num_src_nodes) + return self._csrc_ids def num_src_nodes(self): return self._num_src_nodes @@ -198,21 +213,134 @@ def num_src_nodes(self): def num_dst_nodes(self): return self._num_dst_nodes + def values(self): + return self._values + def formats(self): return self._formats - def coo(self) -> Tuple[torch.Tensor, torch.Tensor]: + def coo(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: if "coo" not in self.formats(): raise RuntimeError( "The SparseGraph did not create a COO layout. " - "Set 'formats' to include 'coo' when creating the graph." + "Set 'formats' list to include 'coo' when creating the graph." ) - return (self._src_ids, self._dst_ids) + return self.src_ids(), self.dst_ids(), self._values - def csc(self) -> Tuple[torch.Tensor, torch.Tensor]: + def csc(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: if "csc" not in self.formats(): raise RuntimeError( "The SparseGraph did not create a CSC layout. " - "Set 'formats' to include 'csc' when creating the graph." + "Set 'formats' list to include 'csc' when creating the graph." 
+ ) + return self.cdst_ids(), self.src_ids(), self._values + + def csr(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + if "csr" not in self.formats(): + raise RuntimeError( + "The SparseGraph did not create a CSR layout. " + "Set 'formats' list to include 'csr' when creating the graph." + ) + csrc_ids = self.csrc_ids() + dst_ids = self.dst_ids()[self._perm_csc2csr] + value = self._values + if value is not None: + value = value[self._perm_csc2csr] + return csrc_ids, dst_ids, value + + +class BaseConv(torch.nn.Module): + r"""An abstract base class for cugraph-ops nn module.""" + + def __init__(self): + super().__init__() + + def reset_parameters(self): + r"""Resets all learnable parameters of the module.""" + raise NotImplementedError + + def forward(self, *args): + r"""Runs the forward pass of the module.""" + raise NotImplementedError + + def get_cugraph_ops_CSC( + self, + g: Union[SparseGraph, dgl.DGLHeteroGraph], + is_bipartite: bool = False, + max_in_degree: Optional[int] = None, + ) -> ops_torch.CSC: + """Create CSC structure needed by cugraph-ops.""" + + if not isinstance(g, (SparseGraph, dgl.DGLHeteroGraph)): + raise TypeError( + f"The graph has to be either a 'cugraph_dgl.nn.SparseGraph' or " + f"'dgl.DGLHeteroGraph', but got '{type(g)}'." ) - return (self._cdst_ids, self._src_ids) + + # TODO: max_in_degree should default to None in pylibcugraphops + if max_in_degree is None: + max_in_degree = -1 + + if isinstance(g, SparseGraph): + offsets, indices, _ = g.csc() + else: + offsets, indices, _ = g.adj_tensors("csc") + + graph = ops_torch.CSC( + offsets=offsets, + indices=indices, + num_src_nodes=g.num_src_nodes(), + dst_max_in_degree=max_in_degree, + is_bipartite=is_bipartite, + ) + + return graph + + def get_cugraph_ops_HeteroCSC( + self, + g: Union[SparseGraph, dgl.DGLHeteroGraph], + num_edge_types: int, + etypes: Optional[torch.Tensor] = None, + is_bipartite: bool = False, + max_in_degree: Optional[int] = None, + ) -> ops_torch.HeteroCSC: + """Create HeteroCSC structure needed by cugraph-ops.""" + + if not isinstance(g, (SparseGraph, dgl.DGLHeteroGraph)): + raise TypeError( + f"The graph has to be either a 'cugraph_dgl.nn.SparseGraph' or " + f"'dgl.DGLHeteroGraph', but got '{type(g)}'." + ) + + # TODO: max_in_degree should default to None in pylibcugraphops + if max_in_degree is None: + max_in_degree = -1 + + if isinstance(g, SparseGraph): + offsets, indices, etypes = g.csc() + if etypes is None: + raise ValueError( + "SparseGraph must have 'values' to create HeteroCSC. " + "Pass in edge types as 'values' when creating the SparseGraph." + ) + etypes = etypes.int() + else: + if etypes is None: + raise ValueError( + "'etypes' is required when creating HeteroCSC " + "from dgl.DGLHeteroGraph." + ) + offsets, indices, perm = g.adj_tensors("csc") + etypes = etypes[perm].int() + + graph = ops_torch.HeteroCSC( + offsets=offsets, + indices=indices, + edge_types=etypes, + num_src_nodes=g.num_src_nodes(), + num_edge_types=num_edge_types, + dst_max_in_degree=max_in_degree, + is_bipartite=is_bipartite, + ) + + return graph diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py index 239def5b677..8843e61ad89 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py @@ -10,13 +10,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -"""Torch Module for graph attention network layer using the aggregation -primitives in cugraph-ops""" -# pylint: disable=no-member, arguments-differ, invalid-name, too-many-arguments -from __future__ import annotations + from typing import Optional, Tuple, Union -from cugraph_dgl.nn.conv.base import BaseConv +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional dgl = import_optional("dgl") @@ -32,13 +29,15 @@ class GATConv(BaseConv): Parameters ---------- - in_feats : int, pair of ints + in_feats : int or tuple Input feature size. A pair denotes feature sizes of source and destination nodes. out_feats : int Output feature size. num_heads : int - Number of heads in Multi-Head Attention. + Number of heads in multi-head attention. + feat_drop : float, optional + Dropout rate on feature. Defaults: ``0``. concat : bool, optional If False, the multi-head attentions are averaged instead of concatenated. Default: ``True``. @@ -46,6 +45,15 @@ class GATConv(BaseConv): Edge feature size. Default: ``None``. negative_slope : float, optional LeakyReLU angle of negative slope. Defaults: ``0.2``. + residual : bool, optional + If True, use residual connection. Defaults: ``False``. + allow_zero_in_degree : bool, optional + If there are 0-in-degree nodes in the graph, output for those nodes will + be invalid since no message will be passed to those nodes. This is + harmful for some applications causing silent performance regression. + This module will raise a DGLError if it detects 0-in-degree nodes in + input graph. By setting ``True``, it will suppress the check and let the + users handle it by themselves. Defaults: ``False``. bias : bool, optional If True, learns a bias term. Defaults: ``True``. 
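To make the reworked interface above concrete, here is a hedged usage sketch of GATConv driven through the new SparseGraph input path added in this patch. The toy COO graph, feature sizes, and variable names are invented for illustration; it assumes a CUDA device with PyTorch, DGL, and pylibcugraphops installed, and is a sketch rather than part of the change itself.

import torch
from cugraph_dgl.nn.conv import GATConv, SparseGraph

device = "cuda"
num_src, num_dst = 5, 3

# A tiny MFG-style bipartite graph in COO form; SparseGraph builds the CSC layout
# consumed by the cugraph-ops kernels (formats defaults to "csc").
src = torch.tensor([0, 1, 2, 3, 4, 0], device=device)
dst = torch.tensor([0, 0, 1, 1, 2, 2], device=device)
g = SparseGraph(size=(num_src, num_dst), src_ids=src, dst_ids=dst, formats="csc")

conv = GATConv(in_feats=(8, 4), out_feats=2, num_heads=3, residual=True).to(device)

# Bipartite node features: (source features, destination features).
feat_src = torch.randn(num_src, 8, device=device)
feat_dst = torch.randn(num_dst, 4, device=device)
out = conv(g, (feat_src, feat_dst))  # shape: (num_dst, num_heads, out_feats)
print(out.shape)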
@@ -81,37 +89,46 @@ class GATConv(BaseConv): [ 1.6477, -1.9986], [ 1.1138, -1.9302]]], device='cuda:0', grad_fn=) """ - MAX_IN_DEGREE_MFG = 200 def __init__( self, in_feats: Union[int, Tuple[int, int]], out_feats: int, num_heads: int, + feat_drop: float = 0.0, concat: bool = True, edge_feats: Optional[int] = None, negative_slope: float = 0.2, + residual: bool = False, + allow_zero_in_degree: bool = False, bias: bool = True, ): super().__init__() self.in_feats = in_feats self.out_feats = out_feats + self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.num_heads = num_heads + self.feat_drop = nn.Dropout(feat_drop) self.concat = concat self.edge_feats = edge_feats self.negative_slope = negative_slope + self.allow_zero_in_degree = allow_zero_in_degree if isinstance(in_feats, int): - self.fc = nn.Linear(in_feats, num_heads * out_feats, bias=False) + self.lin = nn.Linear(in_feats, num_heads * out_feats, bias=False) else: - self.fc_src = nn.Linear(in_feats[0], num_heads * out_feats, bias=False) - self.fc_dst = nn.Linear(in_feats[1], num_heads * out_feats, bias=False) + self.lin_src = nn.Linear( + self.in_feats_src, num_heads * out_feats, bias=False + ) + self.lin_dst = nn.Linear( + self.in_feats_dst, num_heads * out_feats, bias=False + ) if edge_feats is not None: - self.fc_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) + self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) self.attn_weights = nn.Parameter(torch.Tensor(3 * num_heads * out_feats)) else: - self.register_parameter("fc_edge", None) + self.register_parameter("lin_edge", None) self.attn_weights = nn.Parameter(torch.Tensor(2 * num_heads * out_feats)) if bias and concat: @@ -121,28 +138,40 @@ def __init__( else: self.register_buffer("bias", None) + self.residual = residual and self.in_feats_dst != out_feats * num_heads + if self.residual: + self.lin_res = nn.Linear( + self.in_feats_dst, num_heads * out_feats, bias=bias + ) + else: + self.register_buffer("lin_res", None) + self.reset_parameters() def reset_parameters(self): r"""Reinitialize learnable parameters.""" gain = nn.init.calculate_gain("relu") - if hasattr(self, "fc"): - nn.init.xavier_normal_(self.fc.weight, gain=gain) + if hasattr(self, "lin"): + nn.init.xavier_normal_(self.lin.weight, gain=gain) else: - nn.init.xavier_normal_(self.fc_src.weight, gain=gain) - nn.init.xavier_normal_(self.fc_dst.weight, gain=gain) + nn.init.xavier_normal_(self.lin_src.weight, gain=gain) + nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) nn.init.xavier_normal_( self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain ) - if self.fc_edge is not None: - self.fc_edge.reset_parameters() + if self.lin_edge is not None: + self.lin_edge.reset_parameters() + + if self.lin_res is not None: + self.lin_res.reset_parameters() + if self.bias is not None: nn.init.zeros_(self.bias) def forward( self, - g: dgl.DGLHeteroGraph, + g: Union[SparseGraph, dgl.DGLHeteroGraph], nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, max_in_degree: Optional[int] = None, @@ -151,18 +180,17 @@ def forward( Parameters ---------- - graph : DGLGraph + graph : DGLGraph or SparseGraph The graph. nfeat : torch.Tensor Input features of shape :math:`(N, D_{in})`. efeat: torch.Tensor, optional Optional edge features. max_in_degree : int - Maximum in-degree of destination nodes. It is only effective when - :attr:`g` is a :class:`DGLBlock`, i.e., bipartite graph. 
When - :attr:`g` is generated from a neighbor sampler, the value should be - set to the corresponding :attr:`fanout`. If not given, - :attr:`max_in_degree` will be calculated on-the-fly. + Maximum in-degree of destination nodes. When :attr:`g` is generated + from a neighbor sampler, the value should be set to the corresponding + :attr:`fanout`. This option is used to invoke the MFG-variant of + cugraph-ops kernel. Returns ------- @@ -171,49 +199,63 @@ def forward( :math:`H` is the number of heads, and :math:`D_{out}` is size of output feature. """ - if max_in_degree is None: - max_in_degree = -1 - - bipartite = not isinstance(nfeat, torch.Tensor) - offsets, indices, _ = g.adj_tensors("csc") - - graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - dst_max_in_degree=max_in_degree, - is_bipartite=bipartite, + if isinstance(g, dgl.DGLHeteroGraph): + if not self.allow_zero_in_degree: + if (g.in_degrees() == 0).any(): + raise dgl.base.DGLError( + "There are 0-in-degree nodes in the graph, " + "output for those nodes will be invalid. " + "This is harmful for some applications, " + "causing silent performance regression. " + "Adding self-loop on the input graph by " + "calling `g = dgl.add_self_loop(g)` will resolve " + "the issue. Setting ``allow_zero_in_degree`` " + "to be `True` when constructing this module will " + "suppress the check and let the code run." + ) + + bipartite = isinstance(nfeat, (list, tuple)) + + _graph = self.get_cugraph_ops_CSC( + g, is_bipartite=bipartite, max_in_degree=max_in_degree ) + if bipartite: + nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1])) + nfeat_dst_orig = nfeat[1] + else: + nfeat = self.feat_drop(nfeat) + nfeat_dst_orig = nfeat[: g.num_dst_nodes()] + if efeat is not None: - if self.fc_edge is None: + if self.lin_edge is None: raise RuntimeError( f"{self.__class__.__name__}.edge_feats must be set to " f"accept edge features." ) - efeat = self.fc_edge(efeat) + efeat = self.lin_edge(efeat) if bipartite: - if not hasattr(self, "fc_src"): + if not hasattr(self, "lin_src"): raise RuntimeError( f"{self.__class__.__name__}.in_feats must be a pair of " f"integers to allow bipartite node features, but got " f"{self.in_feats}." ) - nfeat_src = self.fc_src(nfeat[0]) - nfeat_dst = self.fc_dst(nfeat[1]) + nfeat_src = self.lin_src(nfeat[0]) + nfeat_dst = self.lin_dst(nfeat[1]) else: - if not hasattr(self, "fc"): + if not hasattr(self, "lin"): raise RuntimeError( f"{self.__class__.__name__}.in_feats is expected to be an " f"integer, but got {self.in_feats}." ) - nfeat = self.fc(nfeat) + nfeat = self.lin(nfeat) out = ops_torch.operators.mha_gat_n2n( (nfeat_src, nfeat_dst) if bipartite else nfeat, self.attn_weights, - graph, + _graph, num_heads=self.num_heads, activation="LeakyReLU", negative_slope=self.negative_slope, @@ -224,6 +266,12 @@ def forward( if self.concat: out = out.view(-1, self.num_heads, self.out_feats) + if self.residual: + res = self.lin_res(nfeat_dst_orig).view(-1, self.num_heads, self.out_feats) + if not self.concat: + res = res.mean(dim=1) + out = out + res + if self.bias is not None: out = out + self.bias diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py new file mode 100644 index 00000000000..209a5fe1a8d --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py @@ -0,0 +1,249 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple, Union + +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph +from cugraph.utilities.utils import import_optional + +dgl = import_optional("dgl") +torch = import_optional("torch") +nn = import_optional("torch.nn") +ops_torch = import_optional("pylibcugraphops.pytorch") + + +class GATv2Conv(BaseConv): + r"""GATv2 from `How Attentive are Graph Attention Networks? + `__, with the sparse aggregation + accelerated by cugraph-ops. + + Parameters + ---------- + in_feats : int, or pair of ints + Input feature size; i.e, the number of dimensions of :math:`h_i^{(l)}`. + If the layer is to be applied to a unidirectional bipartite graph, `in_feats` + specifies the input feature size on both the source and destination nodes. + If a scalar is given, the source and destination node feature size + would take the same value. + out_feats : int + Output feature size; i.e, the number of dimensions of :math:`h_i^{(l+1)}`. + num_heads : int + Number of heads in Multi-Head Attention. + feat_drop : float, optional + Dropout rate on feature. Defaults: ``0``. + concat : bool, optional + If False, the multi-head attentions are averaged instead of concatenated. + Default: ``True``. + edge_feats : int, optional + Edge feature size. Default: ``None``. + negative_slope : float, optional + LeakyReLU angle of negative slope. Defaults: ``0.2``. + residual : bool, optional + If True, use residual connection. Defaults: ``False``. + allow_zero_in_degree : bool, optional + If there are 0-in-degree nodes in the graph, output for those nodes will + be invalid since no message will be passed to those nodes. This is + harmful for some applications causing silent performance regression. + This module will raise a DGLError if it detects 0-in-degree nodes in + input graph. By setting ``True``, it will suppress the check and let the + users handle it by themselves. Defaults: ``False``. + bias : bool, optional + If set to :obj:`False`, the layer will not learn + an additive bias. (default: :obj:`True`) + share_weights : bool, optional + If set to :obj:`True`, the same matrix for :math:`W_{left}` and + :math:`W_{right}` in the above equations, will be applied to the source + and the target node of every edge. 
(default: :obj:`False`) + """ + + def __init__( + self, + in_feats: Union[int, Tuple[int, int]], + out_feats: int, + num_heads: int, + feat_drop: float = 0.0, + concat: bool = True, + edge_feats: Optional[int] = None, + negative_slope: float = 0.2, + residual: bool = False, + allow_zero_in_degree: bool = False, + bias: bool = True, + share_weights: bool = False, + ): + super().__init__() + self.in_feats = in_feats + self.out_feats = out_feats + self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) + self.num_heads = num_heads + self.feat_drop = nn.Dropout(feat_drop) + self.concat = concat + self.edge_feats = edge_feats + self.negative_slope = negative_slope + self.allow_zero_in_degree = allow_zero_in_degree + self.share_weights = share_weights + + self.lin_src = nn.Linear(self.in_feats_src, num_heads * out_feats, bias=bias) + if share_weights: + if self.in_feats_src != self.in_feats_dst: + raise ValueError( + f"Input feature size of source and destination " + f"nodes must be identical when share_weights is enabled, " + f"but got {self.in_feats_src} and {self.in_feats_dst}." + ) + self.lin_dst = self.lin_src + else: + self.lin_dst = nn.Linear( + self.in_feats_dst, num_heads * out_feats, bias=bias + ) + + self.attn = nn.Parameter(torch.Tensor(num_heads * out_feats)) + + if edge_feats is not None: + self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) + else: + self.register_parameter("lin_edge", None) + + if bias and concat: + self.bias = nn.Parameter(torch.Tensor(num_heads, out_feats)) + elif bias and not concat: + self.bias = nn.Parameter(torch.Tensor(out_feats)) + else: + self.register_buffer("bias", None) + + self.residual = residual and self.in_feats_dst != out_feats * num_heads + if self.residual: + self.lin_res = nn.Linear( + self.in_feats_dst, num_heads * out_feats, bias=bias + ) + else: + self.register_buffer("lin_res", None) + + self.reset_parameters() + + def reset_parameters(self): + r"""Reinitialize learnable parameters.""" + gain = nn.init.calculate_gain("relu") + nn.init.xavier_normal_(self.lin_src.weight, gain=gain) + nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) + + nn.init.xavier_normal_( + self.attn.view(-1, self.num_heads, self.out_feats), gain=gain + ) + if self.lin_edge is not None: + self.lin_edge.reset_parameters() + + if self.lin_res is not None: + self.lin_res.reset_parameters() + + if self.bias is not None: + nn.init.zeros_(self.bias) + + def forward( + self, + g: Union[SparseGraph, dgl.DGLHeteroGraph], + nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + efeat: Optional[torch.Tensor] = None, + max_in_degree: Optional[int] = None, + ) -> torch.Tensor: + r"""Forward computation. + + Parameters + ---------- + graph : DGLGraph or SparseGraph + The graph. + nfeat : torch.Tensor + Input features of shape :math:`(N, D_{in})`. + efeat: torch.Tensor, optional + Optional edge features. + max_in_degree : int + Maximum in-degree of destination nodes. When :attr:`g` is generated + from a neighbor sampler, the value should be set to the corresponding + :attr:`fanout`. This option is used to invoke the MFG-variant of + cugraph-ops kernel. + + Returns + ------- + torch.Tensor + The output feature of shape :math:`(N, H, D_{out})` where + :math:`H` is the number of heads, and :math:`D_{out}` is size of + output feature. 
+ """ + + if isinstance(g, dgl.DGLHeteroGraph): + if not self.allow_zero_in_degree: + if (g.in_degrees() == 0).any(): + raise dgl.base.DGLError( + "There are 0-in-degree nodes in the graph, " + "output for those nodes will be invalid. " + "This is harmful for some applications, " + "causing silent performance regression. " + "Adding self-loop on the input graph by " + "calling `g = dgl.add_self_loop(g)` will resolve " + "the issue. Setting ``allow_zero_in_degree`` " + "to be `True` when constructing this module will " + "suppress the check and let the code run." + ) + + nfeat_bipartite = isinstance(nfeat, (list, tuple)) + graph_bipartite = nfeat_bipartite or self.share_weights is False + + _graph = self.get_cugraph_ops_CSC( + g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree + ) + + if nfeat_bipartite: + nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1])) + nfeat_dst_orig = nfeat[1] + else: + nfeat = self.feat_drop(nfeat) + nfeat_dst_orig = nfeat[: g.num_dst_nodes()] + + if efeat is not None: + if self.lin_edge is None: + raise RuntimeError( + f"{self.__class__.__name__}.edge_feats must be set to " + f"accept edge features." + ) + efeat = self.lin_edge(efeat) + + if nfeat_bipartite: + nfeat = (self.lin_src(nfeat[0]), self.lin_dst(nfeat[1])) + elif graph_bipartite: + nfeat = (self.lin_src(nfeat), self.lin_dst(nfeat[: g.num_dst_nodes()])) + else: + nfeat = self.lin_src(nfeat) + + out = ops_torch.operators.mha_gat_v2_n2n( + nfeat, + self.attn, + _graph, + num_heads=self.num_heads, + activation="LeakyReLU", + negative_slope=self.negative_slope, + concat_heads=self.concat, + edge_feat=efeat, + )[: g.num_dst_nodes()] + + if self.concat: + out = out.view(-1, self.num_heads, self.out_feats) + + if self.residual: + res = self.lin_res(nfeat_dst_orig).view(-1, self.num_heads, self.out_feats) + if not self.concat: + res = res.mean(dim=1) + out = out + res + + if self.bias is not None: + out = out + self.bias + + return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py index 89e49011cf7..54916674210 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py @@ -10,14 +10,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Torch Module for Relational graph convolution layer using the aggregation -primitives in cugraph-ops""" -# pylint: disable=no-member, arguments-differ, invalid-name, too-many-arguments -from __future__ import annotations + import math -from typing import Optional +from typing import Optional, Union -from cugraph_dgl.nn.conv.base import BaseConv +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional dgl = import_optional("dgl") @@ -29,13 +26,8 @@ class RelGraphConv(BaseConv): r"""An accelerated relational graph convolution layer from `Modeling Relational Data with Graph Convolutional Networks - `__ that leverages the highly-optimized - aggregation primitives in cugraph-ops. - - See :class:`dgl.nn.pytorch.conv.RelGraphConv` for mathematical model. - - This module depends on :code:`pylibcugraphops` package, which can be - installed via :code:`conda install -c nvidia pylibcugraphops>=23.02`. + `__, with the sparse aggregation + accelerated by cugraph-ops. 
Parameters ---------- @@ -84,7 +76,6 @@ class RelGraphConv(BaseConv): [-1.4335, -2.3758], [-1.4331, -2.3295]], device='cuda:0', grad_fn=) """ - MAX_IN_DEGREE_MFG = 500 def __init__( self, @@ -148,7 +139,7 @@ def reset_parameters(self): def forward( self, - g: dgl.DGLHeteroGraph, + g: Union[SparseGraph, dgl.DGLHeteroGraph], feat: torch.Tensor, etypes: torch.Tensor, max_in_degree: Optional[int] = None, @@ -167,49 +158,24 @@ def forward( so any input of other integer types will be casted into int32, thus introducing some overhead. Pass in int32 tensors directly for best performance. - max_in_degree : int, optional - Maximum in-degree of destination nodes. It is only effective when - :attr:`g` is a :class:`DGLBlock`, i.e., bipartite graph. When - :attr:`g` is generated from a neighbor sampler, the value should be - set to the corresponding :attr:`fanout`. If not given, - :attr:`max_in_degree` will be calculated on-the-fly. + max_in_degree : int + Maximum in-degree of destination nodes. When :attr:`g` is generated + from a neighbor sampler, the value should be set to the corresponding + :attr:`fanout`. This option is used to invoke the MFG-variant of + cugraph-ops kernel. Returns ------- torch.Tensor New node features. Shape: :math:`(|V|, D_{out})`. """ - offsets, indices, edge_ids = g.adj_tensors("csc") - edge_types_perm = etypes[edge_ids.long()].int() - - if g.is_block: - if max_in_degree is None: - max_in_degree = g.in_degrees().max().item() - - if max_in_degree < self.MAX_IN_DEGREE_MFG: - _graph = ops_torch.SampledHeteroCSC( - offsets, - indices, - edge_types_perm, - max_in_degree, - g.num_src_nodes(), - self.num_rels, - ) - else: - offsets_fg = self.pad_offsets(offsets, g.num_src_nodes() + 1) - _graph = ops_torch.StaticHeteroCSC( - offsets_fg, - indices, - edge_types_perm, - self.num_rels, - ) - else: - _graph = ops_torch.StaticHeteroCSC( - offsets, - indices, - edge_types_perm, - self.num_rels, - ) + _graph = self.get_cugraph_ops_HeteroCSC( + g, + num_edge_types=self.num_rels, + etypes=etypes, + is_bipartite=False, + max_in_degree=max_in_degree, + ) h = ops_torch.operators.agg_hg_basis_n2n_post( feat, diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py index 60f4c505e19..a3f946d7cb4 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py @@ -10,11 +10,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Torch Module for GraphSAGE layer using the aggregation primitives in -cugraph-ops""" -# pylint: disable=no-member, arguments-differ, invalid-name, too-many-arguments -from __future__ import annotations -from typing import Optional, Union + +from typing import Optional, Tuple, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -27,22 +24,18 @@ class SAGEConv(BaseConv): r"""An accelerated GraphSAGE layer from `Inductive Representation Learning - on Large Graphs `__ that leverages the - highly-optimized aggregation primitives in cugraph-ops. - - See :class:`dgl.nn.pytorch.conv.SAGEConv` for mathematical model. - - This module depends on :code:`pylibcugraphops` package, which can be - installed via :code:`conda install -c nvidia pylibcugraphops>=23.02`. + on Large Graphs `, with the sparse + aggregation accelerated by cugraph-ops. 
Parameters ---------- - in_feats : int - Input feature size. + in_feats : int or tuple + Input feature size. If a scalar is given, the source and destination + nodes are required to be the same. out_feats : int Output feature size. aggregator_type : str - Aggregator type to use (``mean``, ``sum``, ``min``, ``max``). + Aggregator type to use ("mean", "sum", "min", "max", "pool", "gcn"). feat_drop : float Dropout rate on features, default: ``0``. bias : bool @@ -68,38 +61,57 @@ class SAGEConv(BaseConv): [-1.1690, 0.1952], [-1.1690, 0.1952]], device='cuda:0', grad_fn=) """ - MAX_IN_DEGREE_MFG = 500 + valid_aggr_types = {"mean", "sum", "min", "max", "pool", "gcn"} def __init__( self, - in_feats: int, + in_feats: Union[int, Tuple[int, int]], out_feats: int, aggregator_type: str = "mean", feat_drop: float = 0.0, bias: bool = True, ): super().__init__() - self.in_feats = in_feats - self.out_feats = out_feats - valid_aggr_types = {"max", "min", "mean", "sum"} - if aggregator_type not in valid_aggr_types: + + if aggregator_type not in self.valid_aggr_types: raise ValueError( - f"Invalid aggregator_type. Must be one of {valid_aggr_types}. " + f"Invalid aggregator_type. Must be one of {self.valid_aggr_types}. " f"But got '{aggregator_type}' instead." ) - self.aggr = aggregator_type + + self.aggregator_type = aggregator_type + self._aggr = aggregator_type + self.in_feats = in_feats + self.out_feats = out_feats + self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.feat_drop = nn.Dropout(feat_drop) - self.linear = nn.Linear(2 * in_feats, out_feats, bias=bias) + if self.aggregator_type == "gcn": + self._aggr = "mean" + self.lin = nn.Linear(self.in_feats_src, out_feats, bias=bias) + else: + self.lin = nn.Linear( + self.in_feats_src + self.in_feats_dst, out_feats, bias=bias + ) + + if self.aggregator_type == "pool": + self._aggr = "max" + self.pre_lin = nn.Linear(self.in_feats_src, self.in_feats_src) + else: + self.register_parameter("pre_lin", None) + + self.reset_parameters() def reset_parameters(self): r"""Reinitialize learnable parameters.""" - self.linear.reset_parameters() + self.lin.reset_parameters() + if self.pre_lin is not None: + self.pre_lin.reset_parameters() def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - feat: torch.Tensor, + feat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], max_in_degree: Optional[int] = None, ) -> torch.Tensor: r"""Forward computation. @@ -108,7 +120,7 @@ def forward( ---------- g : DGLGraph or SparseGraph The graph. - feat : torch.Tensor + feat : torch.Tensor or tuple Node features. Shape: :math:`(|V|, D_{in})`. max_in_degree : int Maximum in-degree of destination nodes. When :attr:`g` is generated @@ -121,36 +133,34 @@ def forward( torch.Tensor Output node features. Shape: :math:`(|V|, D_{out})`. """ - if max_in_degree is None: - max_in_degree = -1 - - if isinstance(g, SparseGraph): - assert "csc" in g.formats() - offsets, indices = g.csc() - _graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - dst_max_in_degree=max_in_degree, - ) - elif isinstance(g, dgl.DGLHeteroGraph): - offsets, indices, _ = g.adj_tensors("csc") - _graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - dst_max_in_degree=max_in_degree, - ) - else: - raise TypeError( - f"The graph has to be either a 'SparseGraph' or " - f"'dgl.DGLHeteroGraph', but got '{type(g)}'." 
- ) + feat_bipartite = isinstance(feat, (list, tuple)) + graph_bipartite = feat_bipartite or self.aggregator_type == "pool" + + _graph = self.get_cugraph_ops_CSC( + g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree + ) - feat = self.feat_drop(feat) - h = ops_torch.operators.agg_concat_n2n(feat, _graph, self.aggr)[ + if feat_bipartite: + feat = (self.feat_drop(feat[0]), self.feat_drop(feat[1])) + else: + feat = self.feat_drop(feat) + + if self.aggregator_type == "pool": + if feat_bipartite: + feat = (self.pre_lin(feat[0]).relu(), feat[1]) + else: + feat = (self.pre_lin(feat).relu(), feat[: g.num_dst_nodes()]) + # force ctx.needs_input_grad=True in cugraph-ops autograd function + feat[0].requires_grad_() + feat[1].requires_grad_() + + out = ops_torch.operators.agg_concat_n2n(feat, _graph, self._aggr)[ : g.num_dst_nodes() ] - h = self.linear(h) - return h + if self.aggregator_type == "gcn": + out = out[:, : self.in_feats_src] + + out = self.lin(out) + + return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py index 5cd5fbbaebe..8481b9ee265 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py @@ -10,9 +10,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from typing import Optional, Tuple, Union -from cugraph_dgl.nn.conv.base import BaseConv +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional dgl = import_optional("dgl") @@ -114,7 +115,7 @@ def reset_parameters(self): def forward( self, - g: dgl.DGLHeteroGraph, + g: Union[SparseGraph, dgl.DGLHeteroGraph], nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -130,17 +131,12 @@ def forward( efeat: torch.Tensor, optional Edge feature tensor. Default: ``None``. 
""" - offsets, indices, _ = g.adj_tensors("csc") - graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - is_bipartite=True, - ) - - if isinstance(nfeat, torch.Tensor): + feat_bipartite = isinstance(nfeat, (list, tuple)) + if not feat_bipartite: nfeat = (nfeat, nfeat) + _graph = self.get_cugraph_ops_CSC(g, is_bipartite=True) + query = self.lin_query(nfeat[1][: g.num_dst_nodes()]) key = self.lin_key(nfeat[0]) value = self.lin_value(nfeat[0]) @@ -157,7 +153,7 @@ def forward( key_emb=key, query_emb=query, value_emb=value, - graph=graph, + graph=_graph, num_heads=self.num_heads, concat_heads=self.concat, edge_emb=efeat, diff --git a/python/cugraph-dgl/tests/conftest.py b/python/cugraph-dgl/tests/conftest.py index 6f8690d1140..a3863ed81fa 100644 --- a/python/cugraph-dgl/tests/conftest.py +++ b/python/cugraph-dgl/tests/conftest.py @@ -40,16 +40,19 @@ class SparseGraphData1: nnz = 6 src_ids = torch.IntTensor([0, 1, 2, 3, 2, 5]).cuda() dst_ids = torch.IntTensor([1, 2, 3, 4, 0, 3]).cuda() + values = torch.IntTensor([10, 20, 30, 40, 50, 60]).cuda() # CSR src_ids_sorted_by_src = torch.IntTensor([0, 1, 2, 2, 3, 5]).cuda() dst_ids_sorted_by_src = torch.IntTensor([1, 2, 0, 3, 4, 3]).cuda() csrc_ids = torch.IntTensor([0, 1, 2, 4, 5, 5, 6]).cuda() + values_csr = torch.IntTensor([10, 20, 50, 30, 40, 60]).cuda() # CSC src_ids_sorted_by_dst = torch.IntTensor([2, 0, 1, 5, 2, 3]).cuda() dst_ids_sorted_by_dst = torch.IntTensor([0, 1, 2, 3, 3, 4]).cuda() cdst_ids = torch.IntTensor([0, 1, 2, 3, 5, 6]).cuda() + values_csc = torch.IntTensor([50, 10, 20, 60, 30, 40]).cuda() @pytest.fixture diff --git a/python/cugraph-dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/tests/nn/test_gatconv.py index 7ed65645a28..ef3047dc2cd 100644 --- a/python/cugraph-dgl/tests/nn/test_gatconv.py +++ b/python/cugraph-dgl/tests/nn/test_gatconv.py @@ -10,69 +10,84 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# pylint: disable=too-many-arguments, too-many-locals import pytest -try: - import cugraph_dgl -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional +from cugraph_dgl.nn.conv.base import SparseGraph +from cugraph_dgl.nn import GATConv as CuGraphGATConv from .common import create_graph1 -torch = import_optional("torch") -dgl = import_optional("dgl") +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") + +ATOL = 1e-6 @pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("idtype_int", [False, True]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) +@pytest.mark.parametrize("residual", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) -def test_gatconv_equality(bipartite, idtype_int, max_in_degree, num_heads, to_block): - GATConv = dgl.nn.GATConv - CuGraphGATConv = cugraph_dgl.nn.GATConv - device = "cuda" - g = create_graph1().to(device) +@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) +def test_gatconv_equality( + bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format +): + from dgl.nn.pytorch import GATConv + + g = create_graph1().to("cuda") if idtype_int: g = g.int() - if to_block: g = dgl.to_block(g) + size = (g.num_src_nodes(), g.num_dst_nodes()) + if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0], device=device), - torch.rand(g.num_dst_nodes(), in_feats[1], device=device), + torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), + torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats, device=device) + nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() out_feats = 2 + if sparse_format == "coo": + sg = SparseGraph( + size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" + ) + elif sparse_format == "csc": + offsets, indices, _ = g.adj_tensors("csc") + sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") + args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False} + kwargs = {"bias": False, "allow_zero_in_degree": True} - conv1 = GATConv(*args, **kwargs, allow_zero_in_degree=True).to(device) + conv1 = GATConv(*args, **kwargs).cuda() out1 = conv1(g, nfeat) - conv2 = CuGraphGATConv(*args, **kwargs).to(device) + conv2 = CuGraphGATConv(*args, **kwargs).cuda() dim = num_heads * out_feats with torch.no_grad(): conv2.attn_weights.data[:dim] = conv1.attn_l.data.flatten() conv2.attn_weights.data[dim:] = conv1.attn_r.data.flatten() if bipartite: - conv2.fc_src.weight.data = conv1.fc_src.weight.data.detach().clone() - conv2.fc_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() + conv2.lin_src.weight.data = conv1.fc_src.weight.data.detach().clone() + conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() else: - conv2.fc.weight.data = conv1.fc.weight.data.detach().clone() - out2 = conv2(g, nfeat, max_in_degree=max_in_degree) + conv2.lin.weight.data = conv1.fc.weight.data.detach().clone() + if residual and conv2.residual: + conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone() - assert torch.allclose(out1, out2, atol=1e-6) + if sparse_format is not None: + out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) + else: + out2 = conv2(g, nfeat, max_in_degree=max_in_degree) + + assert torch.allclose(out1, 
out2, atol=ATOL) grad_out1 = torch.rand_like(out1) grad_out2 = grad_out1.clone().detach() @@ -81,18 +96,18 @@ def test_gatconv_equality(bipartite, idtype_int, max_in_degree, num_heads, to_bl if bipartite: assert torch.allclose( - conv1.fc_src.weight.grad, conv2.fc_src.weight.grad, atol=1e-6 + conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL ) assert torch.allclose( - conv1.fc_dst.weight.grad, conv2.fc_dst.weight.grad, atol=1e-6 + conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL ) else: - assert torch.allclose(conv1.fc.weight.grad, conv2.fc.weight.grad, atol=1e-6) + assert torch.allclose(conv1.fc.weight.grad, conv2.lin.weight.grad, atol=ATOL) assert torch.allclose( torch.cat((conv1.attn_l.grad, conv1.attn_r.grad), dim=0), conv2.attn_weights.grad.view(2, num_heads, out_feats), - atol=1e-6, + atol=ATOL, ) @@ -106,10 +121,7 @@ def test_gatconv_equality(bipartite, idtype_int, max_in_degree, num_heads, to_bl def test_gatconv_edge_feats( bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats ): - from cugraph_dgl.nn import GATConv - - device = "cuda" - g = create_graph1().to(device) + g = create_graph1().to("cuda") if to_block: g = dgl.to_block(g) @@ -117,24 +129,30 @@ def test_gatconv_edge_feats( if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0], device=device), - torch.rand(g.num_dst_nodes(), in_feats[1], device=device), + torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), + torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats, device=device) + nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() out_feats = 2 if use_edge_feats: edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats, device=device) + efeat = torch.rand(g.num_edges(), edge_feats).cuda() else: edge_feats = None efeat = None - conv = GATConv( - in_feats, out_feats, num_heads, concat=concat, edge_feats=edge_feats, bias=bias - ).to(device) + conv = CuGraphGATConv( + in_feats, + out_feats, + num_heads, + concat=concat, + edge_feats=edge_feats, + bias=bias, + allow_zero_in_degree=True, + ).cuda() out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) grad_out = torch.rand_like(out) diff --git a/python/cugraph-dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/tests/nn/test_gatv2conv.py new file mode 100644 index 00000000000..cc46a6e4b39 --- /dev/null +++ b/python/cugraph-dgl/tests/nn/test_gatv2conv.py @@ -0,0 +1,147 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import pytest
+
+from cugraph_dgl.nn.conv.base import SparseGraph
+from cugraph_dgl.nn import GATv2Conv as CuGraphGATv2Conv
+from .common import create_graph1
+
+dgl = pytest.importorskip("dgl", reason="DGL not available")
+torch = pytest.importorskip("torch", reason="PyTorch not available")
+
+ATOL = 1e-6
+
+
+@pytest.mark.parametrize("bipartite", [False, True])
+@pytest.mark.parametrize("idtype_int", [False, True])
+@pytest.mark.parametrize("max_in_degree", [None, 8])
+@pytest.mark.parametrize("num_heads", [1, 2, 7])
+@pytest.mark.parametrize("residual", [False, True])
+@pytest.mark.parametrize("to_block", [False, True])
+@pytest.mark.parametrize("sparse_format", ["coo", "csc", None])
+def test_gatv2conv_equality(
+    bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format
+):
+    from dgl.nn.pytorch import GATv2Conv
+
+    g = create_graph1().to("cuda")
+
+    if idtype_int:
+        g = g.int()
+    if to_block:
+        g = dgl.to_block(g)
+
+    size = (g.num_src_nodes(), g.num_dst_nodes())
+
+    if bipartite:
+        in_feats = (10, 3)
+        nfeat = (
+            torch.rand(g.num_src_nodes(), in_feats[0]).cuda(),
+            torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(),
+        )
+    else:
+        in_feats = 10
+        nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda()
+    out_feats = 2
+
+    if sparse_format == "coo":
+        sg = SparseGraph(
+            size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc"
+        )
+    elif sparse_format == "csc":
+        offsets, indices, _ = g.adj_tensors("csc")
+        sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc")
+
+    args = (in_feats, out_feats, num_heads)
+    kwargs = {"bias": False, "allow_zero_in_degree": True}
+
+    conv1 = GATv2Conv(*args, **kwargs).cuda()
+    out1 = conv1(g, nfeat)
+
+    conv2 = CuGraphGATv2Conv(*args, **kwargs).cuda()
+    with torch.no_grad():
+        conv2.attn.data = conv1.attn.data.flatten()
+        conv2.lin_src.weight.data = conv1.fc_src.weight.data.detach().clone()
+        conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone()
+        if residual and conv2.residual:
+            conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone()
+
+    if sparse_format is not None:
+        out2 = conv2(sg, nfeat, max_in_degree=max_in_degree)
+    else:
+        out2 = conv2(g, nfeat, max_in_degree=max_in_degree)
+
+    assert torch.allclose(out1, out2, atol=ATOL)
+
+    grad_out1 = torch.rand_like(out1)
+    grad_out2 = grad_out1.clone().detach()
+    out1.backward(grad_out1)
+    out2.backward(grad_out2)
+
+    assert torch.allclose(
+        conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL
+    )
+    assert torch.allclose(
+        conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL
+    )
+
+    assert torch.allclose(conv1.attn.grad.flatten(), conv2.attn.grad, atol=ATOL)
+
+
+@pytest.mark.parametrize("bias", [False, True])
+@pytest.mark.parametrize("bipartite", [False, True])
+@pytest.mark.parametrize("concat", [False, True])
+@pytest.mark.parametrize("max_in_degree", [None, 8, 800])
+@pytest.mark.parametrize("num_heads", [1, 2, 7])
+@pytest.mark.parametrize("to_block", [False, True])
+@pytest.mark.parametrize("use_edge_feats", [False, True])
+def test_gatv2conv_edge_feats(
+    bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats
+):
+    g = create_graph1().to("cuda")
+
+    if to_block:
+        g = dgl.to_block(g)
+
+    if bipartite:
+        in_feats = (10, 3)
+        nfeat = (
+            torch.rand(g.num_src_nodes(), in_feats[0]).cuda(),
+            torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(),
+        )
+    else:
+        in_feats = 10
+        nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda()
+    out_feats = 2
+
+    if use_edge_feats:
+        edge_feats = 3
+ efeat = torch.rand(g.num_edges(), edge_feats).cuda() + else: + edge_feats = None + efeat = None + + conv = CuGraphGATv2Conv( + in_feats, + out_feats, + num_heads, + concat=concat, + edge_feats=edge_feats, + bias=bias, + allow_zero_in_degree=True, + ).cuda() + out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) + + grad_out = torch.rand_like(out) + out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/tests/nn/test_relgraphconv.py index d2ae6a23978..901f9ba1433 100644 --- a/python/cugraph-dgl/tests/nn/test_relgraphconv.py +++ b/python/cugraph-dgl/tests/nn/test_relgraphconv.py @@ -10,20 +10,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# pylint: disable=too-many-arguments, too-many-locals import pytest -try: - import cugraph_dgl -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional +from cugraph_dgl.nn.conv.base import SparseGraph +from cugraph_dgl.nn import RelGraphConv as CuGraphRelGraphConv from .common import create_graph1 -torch = import_optional("torch") -dgl = import_optional("dgl") +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") + +ATOL = 1e-6 @pytest.mark.parametrize("idtype_int", [False, True]) @@ -32,12 +29,17 @@ @pytest.mark.parametrize("regularizer", [None, "basis"]) @pytest.mark.parametrize("self_loop", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) +@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_relgraphconv_equality( - idtype_int, max_in_degree, num_bases, regularizer, self_loop, to_block + idtype_int, + max_in_degree, + num_bases, + regularizer, + self_loop, + to_block, + sparse_format, ): - RelGraphConv = dgl.nn.RelGraphConv - CuGraphRelGraphConv = cugraph_dgl.nn.RelGraphConv - device = "cuda" + from dgl.nn.pytorch import RelGraphConv in_feat, out_feat, num_rels = 10, 2, 3 args = (in_feat, out_feat, num_rels) @@ -47,34 +49,57 @@ def test_relgraphconv_equality( "bias": False, "self_loop": self_loop, } - g = create_graph1().to(device) - g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).to(device) + g = create_graph1().to("cuda") + g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).cuda() + if idtype_int: g = g.int() if to_block: g = dgl.to_block(g) - feat = torch.rand(g.num_src_nodes(), in_feat).to(device) + + size = (g.num_src_nodes(), g.num_dst_nodes()) + feat = torch.rand(g.num_src_nodes(), in_feat).cuda() + + if sparse_format == "coo": + sg = SparseGraph( + size=size, + src_ids=g.edges()[0], + dst_ids=g.edges()[1], + values=g.edata[dgl.ETYPE], + formats="csc", + ) + elif sparse_format == "csc": + offsets, indices, perm = g.adj_tensors("csc") + etypes = g.edata[dgl.ETYPE][perm] + sg = SparseGraph( + size=size, src_ids=indices, cdst_ids=offsets, values=etypes, formats="csc" + ) torch.manual_seed(0) - conv1 = RelGraphConv(*args, **kwargs).to(device) + conv1 = RelGraphConv(*args, **kwargs).cuda() torch.manual_seed(0) kwargs["apply_norm"] = False - conv2 = CuGraphRelGraphConv(*args, **kwargs).to(device) + conv2 = CuGraphRelGraphConv(*args, **kwargs).cuda() out1 = conv1(g, feat, g.edata[dgl.ETYPE]) - out2 = conv2(g, feat, g.edata[dgl.ETYPE], max_in_degree=max_in_degree) - assert torch.allclose(out1, out2, atol=1e-06) + + if 
sparse_format is not None: + out2 = conv2(sg, feat, sg.values(), max_in_degree=max_in_degree) + else: + out2 = conv2(g, feat, g.edata[dgl.ETYPE], max_in_degree=max_in_degree) + + assert torch.allclose(out1, out2, atol=ATOL) grad_out = torch.rand_like(out1) out1.backward(grad_out) out2.backward(grad_out) end = -1 if self_loop else None - assert torch.allclose(conv1.linear_r.W.grad, conv2.W.grad[:end], atol=1e-6) + assert torch.allclose(conv1.linear_r.W.grad, conv2.W.grad[:end], atol=ATOL) if self_loop: - assert torch.allclose(conv1.loop_weight.grad, conv2.W.grad[-1], atol=1e-6) + assert torch.allclose(conv1.loop_weight.grad, conv2.W.grad[-1], atol=ATOL) if regularizer is not None: - assert torch.allclose(conv1.linear_r.coeff.grad, conv2.coeff.grad, atol=1e-6) + assert torch.allclose(conv1.linear_r.coeff.grad, conv2.coeff.grad, atol=ATOL) diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/tests/nn/test_sageconv.py index 447bbe49460..e2acf9e6596 100644 --- a/python/cugraph-dgl/tests/nn/test_sageconv.py +++ b/python/cugraph-dgl/tests/nn/test_sageconv.py @@ -10,31 +10,33 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# pylint: disable=too-many-arguments, too-many-locals import pytest -from cugraph.utilities.utils import import_optional from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import SAGEConv as CuGraphSAGEConv from .common import create_graph1 -torch = import_optional("torch") -dgl = import_optional("dgl") +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") +ATOL = 1e-6 + +@pytest.mark.parametrize("aggr", ["mean", "pool"]) @pytest.mark.parametrize("bias", [False, True]) +@pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("idtype_int", [False, True]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block, sparse_format): - SAGEConv = dgl.nn.SAGEConv - device = "cuda" +def test_sageconv_equality( + aggr, bias, bipartite, idtype_int, max_in_degree, to_block, sparse_format +): + from dgl.nn.pytorch import SAGEConv - in_feat, out_feat = 5, 2 - kwargs = {"aggregator_type": "mean", "bias": bias} - g = create_graph1().to(device) + kwargs = {"aggregator_type": aggr, "bias": bias} + g = create_graph1().to("cuda") if idtype_int: g = g.int() @@ -42,7 +44,17 @@ def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block, sparse_for g = dgl.to_block(g) size = (g.num_src_nodes(), g.num_dst_nodes()) - feat = torch.rand(g.num_src_nodes(), in_feat).to(device) + + if bipartite: + in_feats = (5, 3) + feat = ( + torch.rand(size[0], in_feats[0], requires_grad=True).cuda(), + torch.rand(size[1], in_feats[1], requires_grad=True).cuda(), + ) + else: + in_feats = 5 + feat = torch.rand(size[0], in_feats).cuda() + out_feats = 2 if sparse_format == "coo": sg = SparseGraph( @@ -52,39 +64,38 @@ def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block, sparse_for offsets, indices, _ = g.adj_tensors("csc") sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - torch.manual_seed(0) - conv1 = SAGEConv(in_feat, out_feat, **kwargs).to(device) - - torch.manual_seed(0) - conv2 = CuGraphSAGEConv(in_feat, out_feat, 
**kwargs).to(device) + conv1 = SAGEConv(in_feats, out_feats, **kwargs).cuda() + conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).cuda() + in_feats_src = conv2.in_feats_src with torch.no_grad(): - conv2.linear.weight.data[:, :in_feat] = conv1.fc_neigh.weight.data - conv2.linear.weight.data[:, in_feat:] = conv1.fc_self.weight.data + conv2.lin.weight.data[:, :in_feats_src] = conv1.fc_neigh.weight.data + conv2.lin.weight.data[:, in_feats_src:] = conv1.fc_self.weight.data if bias: - conv2.linear.bias.data[:] = conv1.fc_self.bias.data + conv2.lin.bias.data[:] = conv1.fc_self.bias.data + if aggr == "pool": + conv2.pre_lin.weight.data[:] = conv1.fc_pool.weight.data + conv2.pre_lin.bias.data[:] = conv1.fc_pool.bias.data out1 = conv1(g, feat) if sparse_format is not None: out2 = conv2(sg, feat, max_in_degree=max_in_degree) else: out2 = conv2(g, feat, max_in_degree=max_in_degree) - assert torch.allclose(out1, out2, atol=1e-06) + assert torch.allclose(out1, out2, atol=ATOL) grad_out = torch.rand_like(out1) out1.backward(grad_out) out2.backward(grad_out) assert torch.allclose( conv1.fc_neigh.weight.grad, - conv2.linear.weight.grad[:, :in_feat], - atol=1e-6, + conv2.lin.weight.grad[:, :in_feats_src], + atol=ATOL, ) assert torch.allclose( conv1.fc_self.weight.grad, - conv2.linear.weight.grad[:, in_feat:], - atol=1e-6, + conv2.lin.weight.grad[:, in_feats_src:], + atol=ATOL, ) if bias: - assert torch.allclose( - conv1.fc_self.bias.grad, conv2.linear.bias.grad, atol=1e-6 - ) + assert torch.allclose(conv1.fc_self.bias.grad, conv2.lin.bias.grad, atol=ATOL) diff --git a/python/cugraph-dgl/tests/nn/test_sparsegraph.py b/python/cugraph-dgl/tests/nn/test_sparsegraph.py index 3fb01575d66..09c0df202ff 100644 --- a/python/cugraph-dgl/tests/nn/test_sparsegraph.py +++ b/python/cugraph-dgl/tests/nn/test_sparsegraph.py @@ -19,32 +19,42 @@ def test_coo2csc(sparse_graph_1): data = sparse_graph_1 - values = torch.ones(data.nnz).cuda() + g = SparseGraph( - size=data.size, src_ids=data.src_ids, dst_ids=data.dst_ids, formats="csc" + size=data.size, + src_ids=data.src_ids, + dst_ids=data.dst_ids, + values=data.values, + formats=["csc"], ) - cdst_ids, src_ids = g.csc() + cdst_ids, src_ids, values = g.csc() new = torch.sparse_csc_tensor(cdst_ids, src_ids, values).cuda() old = torch.sparse_coo_tensor( - torch.vstack((data.src_ids, data.dst_ids)), values + torch.vstack((data.src_ids, data.dst_ids)), data.values ).cuda() torch.allclose(new.to_dense(), old.to_dense()) -def test_csc2coo(sparse_graph_1): +def test_csc_input(sparse_graph_1): data = sparse_graph_1 - values = torch.ones(data.nnz).cuda() + g = SparseGraph( size=data.size, src_ids=data.src_ids_sorted_by_dst, cdst_ids=data.cdst_ids, - formats="coo", + values=data.values_csc, + formats=["coo", "csc", "csr"], ) - src_ids, dst_ids = g.coo() + src_ids, dst_ids, values = g.coo() new = torch.sparse_coo_tensor(torch.vstack((src_ids, dst_ids)), values).cuda() old = torch.sparse_csc_tensor( - data.cdst_ids, data.src_ids_sorted_by_dst, values + data.cdst_ids, data.src_ids_sorted_by_dst, data.values_csc ).cuda() torch.allclose(new.to_dense(), old.to_dense()) + + csrc_ids, dst_ids, values = g.csr() + + new = torch.sparse_csr_tensor(csrc_ids, dst_ids, values).cuda() + torch.allclose(new.to_dense(), old.to_dense()) diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/tests/nn/test_transformerconv.py index 00476b9f0bb..b2b69cb35ab 100644 --- a/python/cugraph-dgl/tests/nn/test_transformerconv.py +++ 
b/python/cugraph-dgl/tests/nn/test_transformerconv.py @@ -13,16 +13,14 @@ import pytest -try: - from cugraph_dgl.nn import TransformerConv -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional +from cugraph_dgl.nn.conv.base import SparseGraph +from cugraph_dgl.nn import TransformerConv from .common import create_graph1 -torch = import_optional("torch") -dgl = import_optional("dgl") +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") + +ATOL = 1e-6 @pytest.mark.parametrize("beta", [False, True]) @@ -32,8 +30,16 @@ @pytest.mark.parametrize("num_heads", [1, 2, 3, 4]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) -def test_TransformerConv( - beta, bipartite_node_feats, concat, idtype_int, num_heads, to_block, use_edge_feats +@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) +def test_transformerconv( + beta, + bipartite_node_feats, + concat, + idtype_int, + num_heads, + to_block, + use_edge_feats, + sparse_format, ): device = "cuda" g = create_graph1().to(device) @@ -44,6 +50,15 @@ def test_TransformerConv( if to_block: g = dgl.to_block(g) + size = (g.num_src_nodes(), g.num_dst_nodes()) + if sparse_format == "coo": + sg = SparseGraph( + size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" + ) + elif sparse_format == "csc": + offsets, indices, _ = g.adj_tensors("csc") + sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") + if bipartite_node_feats: in_node_feats = (5, 3) nfeat = ( @@ -71,6 +86,10 @@ def test_TransformerConv( edge_feats=edge_feats, ).to(device) - out = conv(g, nfeat, efeat) + if sparse_format is not None: + out = conv(sg, nfeat, efeat) + else: + out = conv(g, nfeat, efeat) + grad_out = torch.rand_like(out) out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/test_dataset.py b/python/cugraph-dgl/tests/test_dataset.py index 69d50261e55..5db443dc0d8 100644 --- a/python/cugraph-dgl/tests/test_dataset.py +++ b/python/cugraph-dgl/tests/test_dataset.py @@ -123,6 +123,6 @@ def test_homogeneous_sampled_graphs_from_dataframe(return_type, seed_node): assert dgl_block.num_src_nodes() == cugraph_dgl_graph.num_src_nodes() assert dgl_block.num_dst_nodes() == cugraph_dgl_graph.num_dst_nodes() dgl_offsets, dgl_indices, _ = dgl_block.adj_tensors("csc") - cugraph_offsets, cugraph_indices = cugraph_dgl_graph.csc() + cugraph_offsets, cugraph_indices, _ = cugraph_dgl_graph.csc() assert torch.equal(dgl_offsets.to("cpu"), cugraph_offsets.to("cpu")) assert torch.equal(dgl_indices.to("cpu"), cugraph_indices.to("cpu")) diff --git a/python/cugraph-dgl/tests/test_from_dgl_hetrograph.py b/python/cugraph-dgl/tests/test_from_dgl_heterograph.py similarity index 100% rename from python/cugraph-dgl/tests/test_from_dgl_hetrograph.py rename to python/cugraph-dgl/tests/test_from_dgl_heterograph.py
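Taken together, the conv-layer changes above converge GATConv, GATv2Conv, RelGraphConv, SAGEConv and TransformerConv on one input contract: the forward pass accepts either a dgl.DGLHeteroGraph or a cugraph-dgl SparseGraph, with the shared get_cugraph_ops_CSC / get_cugraph_ops_HeteroCSC helpers building the cugraph-ops graph internally. The snippet below is a minimal usage sketch, not part of this diff; it assumes only the SparseGraph and SAGEConv interfaces exercised by the tests above, and the graph size and feature dimensions are illustrative.

# Minimal sketch (assumed usage, mirroring the tests above; node counts and
# feature sizes are made up for illustration). Requires a CUDA device and
# pylibcugraphops, like the rest of cugraph-dgl.
import torch

from cugraph_dgl.nn import SAGEConv
from cugraph_dgl.nn.conv.base import SparseGraph

num_src, num_dst = 6, 5
src_ids = torch.tensor([0, 1, 2, 3, 2, 5], dtype=torch.int32).cuda()
dst_ids = torch.tensor([1, 2, 3, 4, 0, 3], dtype=torch.int32).cuda()

# COO edge list in, CSC out: "csc" is the layout the cugraph-ops kernels consume.
sg = SparseGraph(
    size=(num_src, num_dst), src_ids=src_ids, dst_ids=dst_ids, formats="csc"
)

feat = torch.rand(num_src, 10).cuda()
conv = SAGEConv(10, 2, aggregator_type="mean").cuda()
out = conv(sg, feat)  # expected shape: (num_dst, 2)

The same SparseGraph object can be passed to the other refactored layers (e.g. GATv2Conv or TransformerConv); for bipartite inputs, a (src_feat, dst_feat) tuple is passed in place of a single feature tensor, as in the tests above.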