From 4ee227c7c84ef487828ecadd5fe86934f1fce4eb Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Thu, 7 Sep 2023 16:39:22 -0500 Subject: [PATCH 01/22] Remove the assumption made on the client data's keys (#3835) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When calling `client.has_what(`) which returns the data's key that are held in each worker’s memory, those keys used to be returned as string but a recent change in `dask` changed the type to tuples   From `{worker_ip_address: ("('from-delayed-190587f1b2318dc54d5f92a79e59b71a', 0)", "('from-delayed-190587f1b2318dc54d5f92a79e59b71a', 1)")}` to`{worker_ip_address: (('from-delayed-c3d92b2cc9948634e82a0b2b62453a6c', 0), ('from-delayed-c3d92b2cc9948634e82a0b2b62453a6c', 1))}`   When mapping workers to persisted data in the function `get_persisted_df_worker_map`, an assumption about the type of those keys was made thereby breaking our MG tests. This PR removes that assumption. 
Closes #3834 Authors: - Joseph Nke (https://github.com/jnke2016) - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3835 --- .../cugraph/cugraph/dask/common/part_utils.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/cugraph/cugraph/dask/common/part_utils.py b/python/cugraph/cugraph/dask/common/part_utils.py index fda7e257367..7c0aad6c3ee 100644 --- a/python/cugraph/cugraph/dask/common/part_utils.py +++ b/python/cugraph/cugraph/dask/common/part_utils.py @@ -73,7 +73,7 @@ def persist_distributed_data(dask_df, client): _keys = dask_df.__dask_keys__() worker_dict = {} for i, key in enumerate(_keys): - worker_dict[str(key)] = tuple([worker_addresses[i]]) + worker_dict[key] = tuple([worker_addresses[i]]) persisted = client.persist(dask_df, workers=worker_dict) parts = futures_of(persisted) return parts @@ -89,7 +89,7 @@ def get_persisted_df_worker_map(dask_df, client): ddf_keys = futures_of(dask_df) output_map = {} for w, w_keys in client.has_what().items(): - output_map[w] = [ddf_k for ddf_k in ddf_keys if str(ddf_k.key) in w_keys] + output_map[w] = [ddf_k for ddf_k in ddf_keys if ddf_k.key in w_keys] if len(output_map[w]) == 0: output_map[w] = _create_empty_dask_df_future(dask_df._meta, client, w) return output_map @@ -157,7 +157,7 @@ async def _extract_partitions( # NOTE: We colocate (X, y) here by zipping delayed # n partitions of them as (X1, y1), (X2, y2)... # and asking client to compute a single future for - # each tuple in the list + # each tuple in the list. 
dela = [np.asarray(d.to_delayed()) for d in dask_obj] # TODO: ravel() is causing strange behavior w/ delayed Arrays which are @@ -167,7 +167,7 @@ async def _extract_partitions( parts = client.compute([p for p in zip(*raveled)]) await wait(parts) - key_to_part = [(str(part.key), part) for part in parts] + key_to_part = [(part.key, part) for part in parts] who_has = await client.who_has(parts) return [(first(who_has[key]), part) for key, part in key_to_part] @@ -229,7 +229,7 @@ def load_balance_func(ddf_, by, client=None): wait(parts) who_has = client.who_has(parts) - key_to_part = [(str(part.key), part) for part in parts] + key_to_part = [(part.key, part) for part in parts] gpu_fututres = [ (first(who_has[key]), part.key[1], part) for key, part in key_to_part ] @@ -245,7 +245,7 @@ def load_balance_func(ddf_, by, client=None): for cumsum in cumsum_parts: num_rows.append(cumsum.iloc[-1]) - # Calculate current partition divisions + # Calculate current partition divisions. divisions = [sum(num_rows[0:x:1]) for x in range(0, len(num_rows) + 1)] divisions[-1] = divisions[-1] - 1 divisions = tuple(divisions) @@ -271,7 +271,7 @@ def load_balance_func(ddf_, by, client=None): def concat_dfs(df_list): """ - Concat a list of cudf dataframes + Concat a list of cudf dataframes. """ return cudf.concat(df_list) @@ -279,17 +279,17 @@ def concat_dfs(df_list): def get_delayed_dict(ddf): """ Returns a dicitionary with the dataframe tasks as keys and - the dataframe delayed objects as values + the dataframe delayed objects as values. """ df_delayed = {} for delayed_obj in ddf.to_delayed(): - df_delayed[str(delayed_obj.key)] = delayed_obj + df_delayed[delayed_obj.key] = delayed_obj return df_delayed def concat_within_workers(client, ddf): """ - Concats all partitions within workers without transfers + Concats all partitions within workers without transfers. 
""" df_delayed = get_delayed_dict(ddf) From 6779e896edf310f5bcaad5acb8673995041c2801 Mon Sep 17 00:00:00 2001 From: ralph <137829296+nv-rliu@users.noreply.github.com> Date: Fri, 8 Sep 2023 09:58:44 -0400 Subject: [PATCH 02/22] Adding metadata getter methods to datasets API (#3821) Closes #3820 This PR adds simple getter methods to the `dataset` class, which allows users to easily get information about datasets without need to access the `metadata` dict or look in the directory. ```python from cugraph.datasets import karate # users now call karate.number_of_nodes() # instead of karate.metadata['number_of_nodes'] ``` Authors: - ralph (https://github.com/nv-rliu) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3821 --- python/cugraph/cugraph/datasets/dataset.py | 36 +++++++++++++++++++ .../cugraph/tests/utils/test_dataset.py | 10 ++++++ 2 files changed, 46 insertions(+) diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py index b276a87b88e..877eade7708 100644 --- a/python/cugraph/cugraph/datasets/dataset.py +++ b/python/cugraph/cugraph/datasets/dataset.py @@ -266,6 +266,42 @@ def get_path(self): return self._path.absolute() + def is_directed(self): + """ + Returns True if the graph is a directed graph. + """ + return self.metadata["is_directed"] + + def is_multigraph(self): + """ + Returns True if the graph is a multigraph. + """ + return self.metadata["is_multigraph"] + + def is_symmetric(self): + """ + Returns True if the graph is symmetric. + """ + return self.metadata["is_symmetric"] + + def number_of_nodes(self): + """ + An alias of number_of_vertices() + """ + return self.number_of_vertices() + + def number_of_vertices(self): + """ + Get the number of vertices in the graph. + """ + return self.metadata["number_of_nodes"] + + def number_of_edges(self): + """ + Get the number of edges in the graph. 
+ """ + return self.metadata["number_of_edges"] + def download_all(force=False): """ diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py index 643d0468d46..c2a4f7c6072 100644 --- a/python/cugraph/cugraph/tests/utils/test_dataset.py +++ b/python/cugraph/cugraph/tests/utils/test_dataset.py @@ -328,6 +328,16 @@ def test_is_multigraph(dataset): assert G.is_multigraph() == dataset.metadata["is_multigraph"] +@pytest.mark.parametrize("dataset", ALL_DATASETS) +def test_object_getters(dataset): + assert dataset.is_directed() == dataset.metadata["is_directed"] + assert dataset.is_multigraph() == dataset.metadata["is_multigraph"] + assert dataset.is_symmetric() == dataset.metadata["is_symmetric"] + assert dataset.number_of_nodes() == dataset.metadata["number_of_nodes"] + assert dataset.number_of_vertices() == dataset.metadata["number_of_nodes"] + assert dataset.number_of_edges() == dataset.metadata["number_of_edges"] + + # # Test experimental for DeprecationWarnings # From 17b34479094e42e1401d0e5354d8da98672ba291 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Fri, 8 Sep 2023 13:38:22 -0500 Subject: [PATCH 03/22] Uses `conda mambabuild` rather than `mamba mambabuild` (#3853) Applies same changes for the same reasons as cuDF PR https://github.com/rapidsai/cudf/pull/14067 to cuGraph. 
Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cugraph/pull/3853 --- ci/build_cpp.sh | 2 +- ci/build_python.sh | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 3fd57f24c40..3fb72cac08b 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -11,6 +11,6 @@ rapids-print-env rapids-logger "Begin cpp build" -rapids-mamba-retry mambabuild conda/recipes/libcugraph +rapids-conda-retry mambabuild conda/recipes/libcugraph rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index 429ba649d1d..62eb6c2ccec 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -15,12 +15,12 @@ rapids-logger "Begin py build" # TODO: Remove `--no-test` flags once importing on a CPU # node works correctly -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/pylibcugraph -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -30,7 +30,7 @@ rapids-mamba-retry mambabuild \ # platform to ensure it is included in each set of artifacts, since test # scripts only install from one set of artifacts based on the CUDA version used # for the test run. -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -40,7 +40,7 @@ rapids-mamba-retry mambabuild \ # built on each CUDA platform to ensure they are included in each set of # artifacts, since test scripts only install from one set of artifacts based on # the CUDA version used for the test run. 
-rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -50,7 +50,7 @@ RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then # Only CUDA 11 is supported right now due to PyTorch requirement. - rapids-mamba-retry mambabuild \ + rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -60,7 +60,7 @@ if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then conda/recipes/cugraph-pyg # Only CUDA 11 is supported right now due to PyTorch requirement. - rapids-mamba-retry mambabuild \ + rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ From e55c131e35081e368db9b315a5f9706e048709f8 Mon Sep 17 00:00:00 2001 From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com> Date: Wed, 13 Sep 2023 13:01:24 -0400 Subject: [PATCH 04/22] Fix subtle memory leak in nbr_intersection primitive (#3858) Closes https://github.com/rapidsai/graph_dl/issues/259 A customer found a subtle memory leak in Jaccard similarity. Tracked it down to this subtle error. `major_nbr_indices` is an `std::optional` that is initialized to `std::nullopt`. Overwriting the dereferenced entry replaces the value but does not mark the optional as containing a value. So the resulting value is never destroyed. 
Authors: - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Seunghwa Kang (https://github.com/seunghwak) URL: https://github.com/rapidsai/cugraph/pull/3858 --- cpp/src/prims/detail/nbr_intersection.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh index f4c4745b14c..2f30faebb3e 100644 --- a/cpp/src/prims/detail/nbr_intersection.cuh +++ b/cpp/src/prims/detail/nbr_intersection.cuh @@ -1023,7 +1023,7 @@ nbr_intersection(raft::handle_t const& handle, (*major_nbr_offsets).begin() + 1); } - std::tie(*major_nbr_indices, std::ignore) = shuffle_values( + std::tie(major_nbr_indices, std::ignore) = shuffle_values( major_comm, local_nbrs_for_rx_majors.begin(), local_nbr_counts, handle.get_stream()); if constexpr (!std::is_same_v) { From 5f7616173069cee5d856348f6084684962c670d6 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Mon, 18 Sep 2023 21:21:14 -0700 Subject: [PATCH 05/22] Sampling post processing functions to accelerate MFG creation. (#3815) Closes #3787 Closes #3788 Added C++ functions to accelerate the MFG (message flow graph) creation step after sampling in the end-to-end GNN workflow. Three C++ public functions are added to accelerate the GNN workflow. 
1) renumbering + compression (CSR/DCSR/CSC/DCSC) 2) renumbering + sorting (COO) 3) sorting only (COO) Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Joseph Nke (https://github.com/jnke2016) - Vibhu Jawa (https://github.com/VibhuJawa) - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3815 --- cpp/CMakeLists.txt | 1 + .../cugraph/detail/utility_wrappers.hpp | 4 +- cpp/include/cugraph/graph_functions.hpp | 4 + cpp/include/cugraph/sampling_functions.hpp | 296 +++ cpp/src/c_api/uniform_neighbor_sampling.cpp | 2 +- cpp/src/prims/kv_store.cuh | 1 + ...r_v_random_select_transform_outgoing_e.cuh | 4 +- .../renumber_sampled_edgelist_impl.cuh | 2 + .../sampling/renumber_sampled_edgelist_sg.cu | 3 +- .../sampling_post_processing_impl.cuh | 1800 +++++++++++++++++ .../sampling/sampling_post_processing_sg.cu | 389 ++++ cpp/tests/CMakeLists.txt | 6 +- .../renumber_sampled_edgelist_test.cu | 512 ----- .../sampling/sampling_post_processing_test.cu | 1457 +++++++++++++ 14 files changed, 3960 insertions(+), 521 deletions(-) create mode 100644 cpp/include/cugraph/sampling_functions.hpp create mode 100644 cpp/src/sampling/sampling_post_processing_impl.cuh create mode 100644 cpp/src/sampling/sampling_post_processing_sg.cu delete mode 100644 cpp/tests/sampling/renumber_sampled_edgelist_test.cu create mode 100644 cpp/tests/sampling/sampling_post_processing_test.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 370e665106d..69a488de0b8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -228,6 +228,7 @@ set(CUGRAPH_SOURCES src/sampling/uniform_neighbor_sampling_mg.cpp src/sampling/uniform_neighbor_sampling_sg.cpp src/sampling/renumber_sampled_edgelist_sg.cu + src/sampling/sampling_post_processing_sg.cu src/cores/core_number_sg.cu src/cores/core_number_mg.cu src/cores/k_core_sg.cu diff --git a/cpp/include/cugraph/detail/utility_wrappers.hpp 
b/cpp/include/cugraph/detail/utility_wrappers.hpp index a15dbf34cf9..faa0fbb841b 100644 --- a/cpp/include/cugraph/detail/utility_wrappers.hpp +++ b/cpp/include/cugraph/detail/utility_wrappers.hpp @@ -37,8 +37,8 @@ namespace detail { * @param[in] stream_view stream view * @param[out] d_value device array to fill * @param[in] size number of elements in array - * @param[in] min_value minimum value - * @param[in] max_value maximum value + * @param[in] min_value minimum value (inclusive) + * @param[in] max_value maximum value (exclusive) * @param[in] rng_state The RngState instance holding pseudo-random number generator state. * */ diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 200ee725b7a..5c1e9d5311f 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -919,6 +919,10 @@ rmm::device_uvector select_random_vertices( /** * @brief renumber sampling output * + * @deprecated This API will be deprecated and will be replaced by the + * renumber_and_compress_sampled_edgelist and renumber_and_sort_sampled_edgelist functions in + * sampling_functions.hpp. + * * This function renumbers sampling function (e.g. uniform_neighbor_sample) outputs satisfying the * following requirements. * diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp new file mode 100644 index 00000000000..e42ef9bfcf3 --- /dev/null +++ b/cpp/include/cugraph/sampling_functions.hpp @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include + +namespace cugraph { + +/* + * @brief renumber sampled edge list and compress to the (D)CSR|(D)CSC format. + * + * This function renumbers sampling function (e.g. uniform_neighbor_sample) output edges fulfilling + * the following requirements. Assume major = source if @p src_is_major is true, major = destination + * if @p src_is_major is false. + * + * 1. If @p edgelist_hops is valid, we can consider (vertex ID, hop, flag=major) triplets for each + * vertex ID in edge majors (@p edgelist_srcs if @p src_is_major is true, @p edgelist_dsts if false) + * and (vertex ID, hop, flag=minor) triplets for each vertex ID in edge minors. From these triplets, + * we can find the minimum (hop, flag) pairs for every unique vertex ID (hop is the primary key and + * flag is the secondary key, flag=major is considered smaller than flag=minor if hop numbers are + * same). Vertex IDs with smaller (hop, flag) pairs precede vertex IDs with larger (hop, flag) pairs + * in renumbering. Ordering can be arbitrary among the vertices with the same (hop, flag) pairs. + * 2. If @p edgelist_hops is invalid, unique vertex IDs in edge majors precede vertex IDs that + * appear only in edge minors. + * 3. If edgelist_label_offsets.has_value() is true, edge lists for different labels will be + * renumbered separately. + * + * The renumbered edges are compressed based on the following requirements. + * + * 1. If @p compress_per_hop is true, edges are compressed separately for each hop. 
If @p + * compress_per_hop is false, edges with different hop numbers are compressed altogether. + * 2. Edges are compressed independently for different labels. + * 3. If @p doubly_compress is false, edges are compressed to CSR (if @p src_is_major is true) or + * CSC (if @p src_is_major is false). If @p doubly_compress is true, edges are compressed to DCSR + * (if @p src_is_major is true) or DCSC (if @p src_is_major is false). If @p doubly_compress is + * false, the CSR/CSC offset array size is the number of vertices (which is the maximum vertex ID + + * 1) + 1. Here, the maximum vertex ID is the maximum major vertex ID in the edges to compress if @p + * compress_per_hop is false or for hop 0. If @p compress_per_hop is true and hop number is 1 or + * larger, the maximum vertex ID is the larger of the maximum major vertex ID for this hop and the + * maximum vertex ID for the edges in the previous hops. + * + * If both @p compress_per_hop is false and @p edgelist_hops.has_value() is true, majors should be + * non-decreasing within each label after renumbering and sorting by (hop, major, minor). Also, + * majors in hop N should not appear in any of the previous hops. This condition is satisfied if + * majors in hop N + 1 does not have any vertices from the previous hops excluding the minors from + * hop N. + * + * This function is single-GPU only (we are not aware of any practical multi-GPU use cases). + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weight. Needs to be floating point type + * @tparam edge_id_t Type of edge id. Needs to be an integral type + * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is + * supported + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param edgelist_srcs A vector storing edgelist source vertices. 
+ * @param edgelist_dsts A vector storing edgelist destination vertices (size = @p + * edgelist_srcs.size()). + * @param edgelist_weights An optional vector storing edgelist weights (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_ids An optional vector storing edgelist edge IDs (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_hops An optional tuple having a vector storing edge list hop numbers (size = @p + * edgelist_srcs.size() if valid) and the number of hops. + * @param edgelist_label_offsets An optional tuple storing a pointer to the array storing label + * offsets to the input edges (size = std::get<1>(*edgelist_label_offsets) + 1) and the number of + * labels. + * @param src_is_major A flag to determine whether to use the source or destination as the + * major key in renumbering and compression. + * @param compress_per_hop A flag to determine whether to compress edges with different hop numbers + * separately (if true) or altogether (if false). If @p compress_per_hop is true, @p + * edgelist_hops.has_value() should be true and @p doubly_compress should be false. + * @param doubly_compress A flag to determine whether to compress to the CSR/CSC format (if false) + * or the DCSR/DCSC format (if true). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
+ * @return Tuple of vectors storing optional DCSR/DCSC major vertex IDs with one or more neighbors, + * (D)CSR|(D)CSC offset values, edge minor vertex IDs, optional edge weights (valid only if @p + * edgelist_weights.has_value() is true), optional edge IDs (valid only if @p + * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p + * edgelist_edge_types.has_value() is true), optional (label, hop) offset values to the + * (D)CSR|(D)CSC offset array (size = # labels * # hops + 1, where # labels = + * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1 + * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1 + * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p + * edgelist_hops.has_value() is rue), renumber_map to query original vertices (size = # unique + * vertices or aggregate # unique vertices for every label), and label offsets to the renumber_map + * (size = std::get<1>(*edgelist_label_offsets) + 1, valid only if @p + * edgelist_label_offsets.has_value() is true). 
+ */ +template +std::tuple>, // dcsr/dcsc major vertices + rmm::device_uvector, // (d)csr/(d)csc offset values + rmm::device_uvector, // minor vertices + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>, // (label, hop) offsets to the (d)csr/(d)csc + // offset array + rmm::device_uvector, // renumber map + std::optional>> // label offsets to the renumber map +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major = true, + bool compress_per_hop = false, + bool doubly_compress = false, + bool do_expensive_check = false); + +/* + * @brief renumber sampled edge list and sort the renumbered edges. + * + * This function renumbers sampling function (e.g. uniform_neighbor_sample) output edges fulfilling + * the following requirements. Assume major = source if @p src_is_major is true, major = destination + * if @p src_is_major is false. + * + * 1. If @p edgelist_hops is valid, we can consider (vertex ID, hop, flag=major) triplets for each + * vertex ID in edge majors (@p edgelist_srcs if @p src_is_major is true, @p edgelist_dsts if false) + * and (vertex ID, hop, flag=minor) triplets for each vertex ID in edge minors. From these triplets, + * we can find the minimum (hop, flag) pairs for every unique vertex ID (hop is the primary key and + * flag is the secondary key, flag=major is considered smaller than flag=minor if hop numbers are + * same). Vertex IDs with smaller (hop, flag) pairs precede vertex IDs with larger (hop, flag) pairs + * in renumbering. Ordering can be arbitrary among the vertices with the same (hop, flag) pairs. + * 2. 
If @p edgelist_hops is invalid, unique vertex IDs in edge majors precede vertex IDs that + * appear only in edge minors. + * 3. If edgelist_label_offsets.has_value() is true, edge lists for different labels will be + * renumbered separately. + * + * The renumbered edges are sorted based on the following rules. + * + * 1. If @p src_is_major is true, use ((hop), src, dst) as the key in sorting. If @p src_is_major is + * false, use ((hop), dst, src) instead. hop is used only if @p edgelist_hops.has_value() is true. + * 2. Edges in each label are sorted independently if @p edgelist_label_offsets.has_value() is true. + * + * This function is single-GPU only (we are not aware of any practical multi-GPU use cases). + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weight. Needs to be floating point type + * @tparam edge_id_t Type of edge id. Needs to be an integral type + * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is + * supported + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param edgelist_srcs A vector storing edgelist source vertices. + * @param edgelist_dsts A vector storing edgelist destination vertices (size = @p + * edgelist_srcs.size()). + * @param edgelist_weights An optional vector storing edgelist weights (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_ids An optional vector storing edgelist edge IDs (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_hops An optional tuple having a vector storing edge list hop numbers (size = @p + * edgelist_srcs.size() if valid) and the number of hops. The hop vector values should be + * non-decreasing within each label. 
+ * @param edgelist_label_offsets An optional tuple storing a pointer to the array storing label + * offsets to the input edges (size = std::get<1>(*edgelist_label_offsets) + 1) and the number of + * labels. + * @param src_is_major A flag to determine whether to use the source or destination as the + * major key in renumbering and sorting. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + * @return Tuple of vectors storing edge sources, edge destinations, optional edge weights (valid + * only if @p edgelist_weights.has_value() is true), optional edge IDs (valid only if @p + * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p + * edgelist_edge_types.has_value() is true), optional (label, hop) offset values to the renumbered + * and sorted edges (size = # labels * # hops + 1, where # labels = + * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1 + * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1 + * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p + * edgelist_hops.has_value() is true), renumber_map to query original vertices (size = # unique + * vertices or aggregate # unique vertices for every label), and label offsets to the renumber_map + * (size = std::get<1>(*edgelist_label_offsets) + 1, valid only if @p + * edgelist_label_offsets.has_value() is true). 
+ */ +template +std::tuple, // srcs + rmm::device_uvector, // dsts + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>, // (label, hop) offsets to the edges + rmm::device_uvector, // renumber map + std::optional>> // label offsets to the renumber map +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major = true, + bool do_expensive_check = false); + +/* + * @brief sort sampled edge list. + * + * Sampled edges are sorted based on the following rules. + * + * 1. If @p src_is_major is true, use ((hop), src, dst) as the key in sorting. If @p src_is_major is + * false, use ((hop), dst, src) instead. hop is used only if @p edgelist_hops.has_value() is true. + * 2. Edges in each label are sorted independently if @p edgelist_label_offsets.has_value() is true. + * + * This function is single-GPU only (we are not aware of any practical multi-GPU use cases). + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weight. Needs to be floating point type + * @tparam edge_id_t Type of edge id. Needs to be an integral type + * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is + * supported + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param edgelist_srcs A vector storing edgelist source vertices. + * @param edgelist_dsts A vector storing edgelist destination vertices (size = @p + * edgelist_srcs.size()). 
+ * @param edgelist_weights An optional vector storing edgelist weights (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_ids An optional vector storing edgelist edge IDs (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_hops An optional tuple having a vector storing edge list hop numbers (size = @p + * edgelist_srcs.size() if valid) and the number of hops. The hop vector values should be + * non-decreasing within each label. + * @param edgelist_label_offsets An optional tuple storing a pointer to the array storing label + * offsets to the input edges (size = std::get<1>(*edgelist_label_offsets) + 1) and the number of + * labels. + * @param src_is_major A flag to determine whether to use the source or destination as the + * major key in renumbering and sorting. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
+ * @return Tuple of vectors storing edge sources, edge destinations, optional edge weights (valid + * only if @p edgelist_weights.has_value() is true), optional edge IDs (valid only if @p + * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p + * edgelist_edge_types.has_value() is true), and optional (label, hop) offset values to the + * renumbered and sorted edges (size = # labels * # hops + 1, where # labels = + * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1 + * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1 + * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p + * edgelist_hops.has_value() is true) + */ +template +std::tuple, // srcs + rmm::device_uvector, // dsts + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>> // (label, hop) offsets to the edges +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major = true, + bool do_expensive_check = false); + +} // namespace cugraph diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index caaba8e9c8d..f146c331d8c 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include diff --git a/cpp/src/prims/kv_store.cuh b/cpp/src/prims/kv_store.cuh index 8490bacfd9c..c46e83aa5da 100644 --- a/cpp/src/prims/kv_store.cuh +++ b/cpp/src/prims/kv_store.cuh @@ -31,6 +31,7 @@ #include #include #include +#include #include #include diff --git 
a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index b238b964ede..3375a651982 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -108,7 +108,7 @@ struct convert_pair_to_quadruplet_t { thrust::seq, displacement_first, displacement_first + minor_comm_size, nbr_idx))) - 1; local_nbr_idx -= *(displacement_first + minor_comm_rank); - cuda::std::atomic_ref counter(tx_counts[minor_comm_rank]); + cuda::atomic_ref counter(tx_counts[minor_comm_rank]); intra_partition_offset = counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); } return thrust::make_tuple(minor_comm_rank, intra_partition_offset, local_nbr_idx, key_idx); @@ -252,7 +252,7 @@ struct count_t { __device__ size_t operator()(size_t key_idx) const { - cuda::std::atomic_ref counter(sample_counts[key_idx]); + cuda::atomic_ref counter(sample_counts[key_idx]); return counter.fetch_add(int32_t{1}, cuda::std::memory_order_relaxed); } }; diff --git a/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh b/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh index 6fdb1c887f2..50f42851a1f 100644 --- a/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh +++ b/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -41,6 +42,7 @@ #include +// FIXME: deprecated, to be deleted namespace cugraph { namespace { diff --git a/cpp/src/sampling/renumber_sampled_edgelist_sg.cu b/cpp/src/sampling/renumber_sampled_edgelist_sg.cu index 46e2264a0c1..9a5f0d357b2 100644 --- a/cpp/src/sampling/renumber_sampled_edgelist_sg.cu +++ b/cpp/src/sampling/renumber_sampled_edgelist_sg.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ -#include +#include #include "renumber_sampled_edgelist_impl.cuh" +// FIXME: deprecated, to be deleted namespace cugraph { template std::tuple, diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh new file mode 100644 index 00000000000..ff8da72ff35 --- /dev/null +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -0,0 +1,1800 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cugraph { + +namespace { + +template +struct edge_order_t { + thrust::optional> edgelist_label_offsets{thrust::nullopt}; + thrust::optional> edgelist_hops{thrust::nullopt}; + raft::device_span edgelist_majors{}; + raft::device_span edgelist_minors{}; + + __device__ bool operator()(size_t l_idx, size_t r_idx) const + { + if (edgelist_label_offsets) { + auto l_label = thrust::distance((*edgelist_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + (*edgelist_label_offsets)[0] + l_idx)); + auto r_label = thrust::distance((*edgelist_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + (*edgelist_label_offsets)[0] + r_idx)); + if (l_label != r_label) { return l_label < r_label; } + } + + if (edgelist_hops) { + auto l_hop = (*edgelist_hops)[l_idx]; + auto r_hop = (*edgelist_hops)[r_idx]; + if (l_hop != r_hop) { return l_hop < r_hop; } + } + + auto l_major = edgelist_majors[l_idx]; + auto r_major = edgelist_majors[r_idx]; + if (l_major != r_major) { return l_major < r_major; } + + auto l_minor = edgelist_minors[l_idx]; + auto r_minor = edgelist_minors[r_idx]; + if (l_minor != r_minor) { return l_minor < r_minor; } + + return l_idx < r_idx; + } +}; + +template +struct is_first_in_run_t { + thrust::optional> edgelist_label_offsets{thrust::nullopt}; + thrust::optional> edgelist_hops{thrust::nullopt}; + raft::device_span edgelist_majors{}; + + __device__ bool operator()(size_t i) const + { + if (i == 0) return true; + if (edgelist_label_offsets) { + auto prev_label = thrust::distance((*edgelist_label_offsets).begin() + 1, + 
thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + i - 1)); + auto this_label = thrust::distance( + (*edgelist_label_offsets).begin() + 1, + thrust::upper_bound( + thrust::seq, (*edgelist_label_offsets).begin() + 1, (*edgelist_label_offsets).end(), i)); + if (this_label != prev_label) { return true; } + } + if (edgelist_hops) { + auto prev_hop = (*edgelist_hops)[i - 1]; + auto this_hop = (*edgelist_hops)[i]; + if (this_hop != prev_hop) { return true; } + } + return edgelist_majors[i] != edgelist_majors[i - 1]; + } +}; + +template +struct compute_label_index_t { + raft::device_span edgelist_label_offsets{}; + + __device__ label_index_t operator()(size_t i) const + { + return static_cast(thrust::distance( + edgelist_label_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), i))); + } +}; + +template +struct optionally_compute_label_index_t { + thrust::optional> edgelist_label_offsets{thrust::nullopt}; + + __device__ label_index_t operator()(size_t i) const + { + return edgelist_label_offsets ? 
static_cast(thrust::distance( +                             (*edgelist_label_offsets).begin() + 1, +                             thrust::upper_bound(thrust::seq, +                                                 (*edgelist_label_offsets).begin() + 1, +                                                 (*edgelist_label_offsets).end(), +                                                 i))) +                         : label_index_t{0}; +  } +}; + +template +void check_input_edges( +  raft::handle_t const& handle, +  rmm::device_uvector const& edgelist_srcs, +  rmm::device_uvector const& edgelist_dsts, +  std::optional> const& edgelist_weights, +  std::optional> const& edgelist_edge_ids, +  std::optional> const& edgelist_edge_types, +  std::optional, size_t>> const& edgelist_hops, +  std::optional, size_t>> edgelist_label_offsets, +  bool do_expensive_check) +{ +  CUGRAPH_EXPECTS(!edgelist_label_offsets || (std::get<1>(*edgelist_label_offsets) <= +                                              std::numeric_limits::max()), +                  "Invalid input arguments: current implementation assumes that the number of " +                  "unique labels is no larger than std::numeric_limits::max()."); +  CUGRAPH_EXPECTS(!edgelist_label_offsets || std::get<1>(*edgelist_label_offsets) > 0, +                  "Invalid input arguments: there should be 1 or more labels if " +                  "edgelist_label_offsets.has_value() is true."); +  CUGRAPH_EXPECTS( +    !edgelist_label_offsets.has_value() || +    (std::get<0>(*edgelist_label_offsets).size() == std::get<1>(*edgelist_label_offsets) + 1), +    "Invalid input arguments: if edgelist_label_offsets is valid, " +    "std::get<0>(*edgelist_label_offsets).size() (size of the offset array) should be " +    "std::get<1>(*edgelist_label_offsets) (number of unique labels) + 1."); + +  CUGRAPH_EXPECTS( +    !edgelist_hops || (std::get<1>(*edgelist_hops) <= std::numeric_limits::max()), +    "Invalid input arguments: current implementation assumes that the number of " +    "hops is no larger than std::numeric_limits::max()."); +  CUGRAPH_EXPECTS(!edgelist_hops || std::get<1>(*edgelist_hops) > 0, +                  "Invalid input arguments: number of hops should be larger than 0 if " +                  "edgelist_hops.has_value() is true."); + +  CUGRAPH_EXPECTS( +    edgelist_srcs.size() == edgelist_dsts.size(), +    "Invalid input arguments: 
edgelist_srcs.size() and edgelist_dsts.size() should coincide."); + CUGRAPH_EXPECTS( + !edgelist_weights.has_value() || (edgelist_srcs.size() == (*edgelist_weights).size()), + "Invalid input arguments: if edgelist_weights is valid, std::get<0>(*edgelist_weights).size() " + "and edgelist_srcs.size() should coincide."); + CUGRAPH_EXPECTS( + !edgelist_edge_ids.has_value() || (edgelist_srcs.size() == (*edgelist_edge_ids).size()), + "Invalid input arguments: if edgelist_edge_ids is valid, " + "std::get<0>(*edgelist_edge_ids).size() and edgelist_srcs.size() should coincide."); + CUGRAPH_EXPECTS( + !edgelist_edge_types.has_value() || (edgelist_srcs.size() == (*edgelist_edge_types).size()), + "Invalid input arguments: if edgelist_edge_types is valid, " + "std::get<0>(*edgelist_edge_types).size() and edgelist_srcs.size() should coincide."); + CUGRAPH_EXPECTS( + !edgelist_hops.has_value() || (edgelist_srcs.size() == std::get<0>(*edgelist_hops).size()), + "Invalid input arguments: if edgelist_hops is valid, std::get<0>(*edgelist_hops).size() and " + "edgelist_srcs.size() should coincide."); + + if (do_expensive_check) { + if (edgelist_label_offsets) { + CUGRAPH_EXPECTS(thrust::is_sorted(handle.get_thrust_policy(), + std::get<0>(*edgelist_label_offsets).begin(), + std::get<0>(*edgelist_label_offsets).end()), + "Invalid input arguments: if edgelist_label_offsets is valid, " + "std::get<0>(*edgelist_label_offsets) should be sorted."); + size_t back_element{}; + raft::update_host( + &back_element, + std::get<0>(*edgelist_label_offsets).data() + std::get<1>(*edgelist_label_offsets), + size_t{1}, + handle.get_stream()); + handle.get_stream(); + CUGRAPH_EXPECTS( + back_element == edgelist_srcs.size(), + "Invalid input arguments: if edgelist_label_offsets is valid, the last element of " + "std::get<0>(*edgelist_label_offsets) and edgelist_srcs.size() should coincide."); + } + } +} + +// output sorted by (primary key:label_index, secondary key:vertex) +template +std::tuple> /* label 
indices */, + rmm::device_uvector /* vertices */, + std::optional> /* minimum hops for the vertices */, + std::optional> /* label offsets for the output */> +compute_min_hop_for_unique_label_vertex_pairs( + raft::handle_t const& handle, + raft::device_span vertices, + std::optional> hops, + std::optional> label_indices, + std::optional> label_offsets) +{ + auto approx_edges_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for segmented sort + + if (label_indices) { + auto num_labels = (*label_offsets).size() - 1; + + rmm::device_uvector tmp_label_indices((*label_indices).size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*label_indices).begin(), + (*label_indices).end(), + tmp_label_indices.begin()); + + rmm::device_uvector tmp_vertices(0, handle.get_stream()); + std::optional> tmp_hops{std::nullopt}; + + if (hops) { + tmp_vertices.resize(vertices.size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), vertices.begin(), vertices.end(), tmp_vertices.begin()); + tmp_hops = rmm::device_uvector((*hops).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), (*hops).begin(), (*hops).end(), (*tmp_hops).begin()); + + auto triplet_first = thrust::make_zip_iterator( + tmp_label_indices.begin(), tmp_vertices.begin(), (*tmp_hops).begin()); + thrust::sort( + handle.get_thrust_policy(), triplet_first, triplet_first + tmp_label_indices.size()); + auto key_first = thrust::make_zip_iterator(tmp_label_indices.begin(), tmp_vertices.begin()); + auto num_uniques = static_cast( + thrust::distance(key_first, + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + key_first, + key_first + tmp_label_indices.size(), + (*tmp_hops).begin())))); + tmp_label_indices.resize(num_uniques, handle.get_stream()); + tmp_vertices.resize(num_uniques, handle.get_stream()); + (*tmp_hops).resize(num_uniques, handle.get_stream()); + 
tmp_label_indices.shrink_to_fit(handle.get_stream()); + tmp_vertices.shrink_to_fit(handle.get_stream()); + (*tmp_hops).shrink_to_fit(handle.get_stream()); + } else { + rmm::device_uvector segment_sorted_vertices(vertices.size(), handle.get_stream()); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + + auto [h_label_offsets, h_edge_offsets] = + detail::compute_offset_aligned_edge_chunks(handle, + (*label_offsets).data(), + num_labels, + vertices.size(), + approx_edges_to_sort_per_iteration); + auto num_chunks = h_label_offsets.size() - 1; + + for (size_t i = 0; i < num_chunks; ++i) { + size_t tmp_storage_bytes{0}; + + auto offset_first = + thrust::make_transform_iterator((*label_offsets).data() + h_label_offsets[i], + detail::shift_left_t{h_edge_offsets[i]}); + cub::DeviceSegmentedSort::SortKeys(static_cast(nullptr), + tmp_storage_bytes, + vertices.begin() + h_edge_offsets[i], + segment_sorted_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + + cub::DeviceSegmentedSort::SortKeys(d_tmp_storage.data(), + tmp_storage_bytes, + vertices.begin() + h_edge_offsets[i], + segment_sorted_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + } + d_tmp_storage.resize(0, handle.get_stream()); + d_tmp_storage.shrink_to_fit(handle.get_stream()); + + auto pair_first = + thrust::make_zip_iterator(tmp_label_indices.begin(), segment_sorted_vertices.begin()); + auto num_uniques = static_cast(thrust::distance( + pair_first, + thrust::unique( + handle.get_thrust_policy(), pair_first, pair_first + tmp_label_indices.size()))); + tmp_label_indices.resize(num_uniques, 
handle.get_stream()); + segment_sorted_vertices.resize(num_uniques, handle.get_stream()); + tmp_label_indices.shrink_to_fit(handle.get_stream()); + segment_sorted_vertices.shrink_to_fit(handle.get_stream()); + + tmp_vertices = std::move(segment_sorted_vertices); + } + + rmm::device_uvector tmp_label_offsets(num_labels + 1, handle.get_stream()); + tmp_label_offsets.set_element_to_zero_async(0, handle.get_stream()); + thrust::upper_bound(handle.get_thrust_policy(), + tmp_label_indices.begin(), + tmp_label_indices.end(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_labels), + tmp_label_offsets.begin() + 1); + + return std::make_tuple(std::move(tmp_label_indices), + std::move(tmp_vertices), + std::move(tmp_hops), + std::move(tmp_label_offsets)); + } else { + rmm::device_uvector tmp_vertices(vertices.size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), vertices.begin(), vertices.end(), tmp_vertices.begin()); + + if (hops) { + rmm::device_uvector tmp_hops((*hops).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), (*hops).begin(), (*hops).end(), tmp_hops.begin()); + + auto pair_first = thrust::make_zip_iterator( + tmp_vertices.begin(), tmp_hops.begin()); // vertex is a primary key, hop is a secondary key + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + tmp_vertices.size()); + tmp_vertices.resize( + thrust::distance(tmp_vertices.begin(), + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + tmp_vertices.begin(), + tmp_vertices.end(), + tmp_hops.begin()))), + handle.get_stream()); + tmp_hops.resize(tmp_vertices.size(), handle.get_stream()); + + return std::make_tuple( + std::nullopt, std::move(tmp_vertices), std::move(tmp_hops), std::nullopt); + } else { + thrust::sort(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end()); + tmp_vertices.resize( + thrust::distance( + tmp_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), 
tmp_vertices.begin(), tmp_vertices.end())), + handle.get_stream()); + tmp_vertices.shrink_to_fit(handle.get_stream()); + + return std::make_tuple(std::nullopt, std::move(tmp_vertices), std::nullopt, std::nullopt); + } + } +} + +template +std::tuple, std::optional>> +compute_renumber_map(raft::handle_t const& handle, + raft::device_span edgelist_majors, + raft::device_span edgelist_minors, + std::optional> edgelist_hops, + std::optional> edgelist_label_offsets) +{ + auto approx_edges_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for segmented sort + + std::optional> edgelist_label_indices{std::nullopt}; + if (edgelist_label_offsets) { + edgelist_label_indices = + detail::expand_sparse_offsets(*edgelist_label_offsets, label_index_t{0}, handle.get_stream()); + } + + auto [unique_label_major_pair_label_indices, + unique_label_major_pair_vertices, + unique_label_major_pair_hops, + unique_label_major_pair_label_offsets] = + compute_min_hop_for_unique_label_vertex_pairs( + handle, + edgelist_majors, + edgelist_hops, + edgelist_label_indices ? std::make_optional>( + (*edgelist_label_indices).data(), (*edgelist_label_indices).size()) + : std::nullopt, + edgelist_label_offsets); + + auto [unique_label_minor_pair_label_indices, + unique_label_minor_pair_vertices, + unique_label_minor_pair_hops, + unique_label_minor_pair_label_offsets] = + compute_min_hop_for_unique_label_vertex_pairs( + handle, + edgelist_minors, + edgelist_hops, + edgelist_label_indices ? 
std::make_optional>( + (*edgelist_label_indices).data(), (*edgelist_label_indices).size()) + : std::nullopt, + edgelist_label_offsets); + + edgelist_label_indices = std::nullopt; + + if (edgelist_label_offsets) { + auto num_labels = (*edgelist_label_offsets).size() - 1; + + rmm::device_uvector renumber_map(0, handle.get_stream()); + rmm::device_uvector renumber_map_label_indices(0, handle.get_stream()); + + renumber_map.reserve((*unique_label_major_pair_label_indices).size() + + (*unique_label_minor_pair_label_indices).size(), + handle.get_stream()); + renumber_map_label_indices.reserve(renumber_map.capacity(), handle.get_stream()); + + auto num_chunks = (edgelist_majors.size() + (approx_edges_to_sort_per_iteration - 1)) / + approx_edges_to_sort_per_iteration; + auto chunk_size = (num_chunks > 0) ? ((num_labels + (num_chunks - 1)) / num_chunks) : 0; + + size_t copy_offset{0}; + for (size_t i = 0; i < num_chunks; ++i) { + auto major_start_offset = + (*unique_label_major_pair_label_offsets).element(chunk_size * i, handle.get_stream()); + auto major_end_offset = + (*unique_label_major_pair_label_offsets) + .element(std::min(chunk_size * (i + 1), num_labels), handle.get_stream()); + auto minor_start_offset = + (*unique_label_minor_pair_label_offsets).element(chunk_size * i, handle.get_stream()); + auto minor_end_offset = + (*unique_label_minor_pair_label_offsets) + .element(std::min(chunk_size * (i + 1), num_labels), handle.get_stream()); + + rmm::device_uvector merged_label_indices( + (major_end_offset - major_start_offset) + (minor_end_offset - minor_start_offset), + handle.get_stream()); + rmm::device_uvector merged_vertices(merged_label_indices.size(), + handle.get_stream()); + rmm::device_uvector merged_flags(merged_label_indices.size(), handle.get_stream()); + + if (edgelist_hops) { + rmm::device_uvector merged_hops(merged_label_indices.size(), handle.get_stream()); + auto major_quad_first = + 
thrust::make_zip_iterator((*unique_label_major_pair_label_indices).begin(), + unique_label_major_pair_vertices.begin(), + (*unique_label_major_pair_hops).begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_quad_first = + thrust::make_zip_iterator((*unique_label_minor_pair_label_indices).begin(), + unique_label_minor_pair_vertices.begin(), + (*unique_label_minor_pair_hops).begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge(handle.get_thrust_policy(), + major_quad_first + major_start_offset, + major_quad_first + major_end_offset, + minor_quad_first + minor_start_offset, + minor_quad_first + minor_end_offset, + thrust::make_zip_iterator(merged_label_indices.begin(), + merged_vertices.begin(), + merged_hops.begin(), + merged_flags.begin())); + + auto unique_key_first = + thrust::make_zip_iterator(merged_label_indices.begin(), merged_vertices.begin()); + merged_label_indices.resize( + thrust::distance( + unique_key_first, + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), + unique_key_first, + unique_key_first + merged_label_indices.size(), + thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), + handle.get_stream()); + merged_vertices.resize(merged_label_indices.size(), handle.get_stream()); + merged_hops.resize(merged_label_indices.size(), handle.get_stream()); + merged_flags.resize(merged_label_indices.size(), handle.get_stream()); + auto sort_key_first = thrust::make_zip_iterator( + merged_label_indices.begin(), merged_hops.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_label_indices.size(), + merged_vertices.begin()); + } else { + auto major_triplet_first = + thrust::make_zip_iterator((*unique_label_major_pair_label_indices).begin(), + unique_label_major_pair_vertices.begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_triplet_first = + 
thrust::make_zip_iterator((*unique_label_minor_pair_label_indices).begin(), + unique_label_minor_pair_vertices.begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge( + handle.get_thrust_policy(), + major_triplet_first + major_start_offset, + major_triplet_first + major_end_offset, + minor_triplet_first + minor_start_offset, + minor_triplet_first + minor_end_offset, + thrust::make_zip_iterator( + merged_label_indices.begin(), merged_vertices.begin(), merged_flags.begin())); + + auto unique_key_first = + thrust::make_zip_iterator(merged_label_indices.begin(), merged_vertices.begin()); + merged_label_indices.resize( + thrust::distance( + unique_key_first, + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + unique_key_first, + unique_key_first + merged_label_indices.size(), + merged_flags.begin()))), + handle.get_stream()); + merged_vertices.resize(merged_label_indices.size(), handle.get_stream()); + merged_flags.resize(merged_label_indices.size(), handle.get_stream()); + auto sort_key_first = + thrust::make_zip_iterator(merged_label_indices.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_label_indices.size(), + merged_vertices.begin()); + } + + renumber_map.resize(copy_offset + merged_vertices.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end(), + renumber_map.begin() + copy_offset); + renumber_map_label_indices.resize(copy_offset + merged_label_indices.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + merged_label_indices.begin(), + merged_label_indices.end(), + renumber_map_label_indices.begin() + copy_offset); + + copy_offset += merged_vertices.size(); + } + + renumber_map.shrink_to_fit(handle.get_stream()); + renumber_map_label_indices.shrink_to_fit(handle.get_stream()); + + return std::make_tuple(std::move(renumber_map), 
std::move(renumber_map_label_indices)); + } else { + if (edgelist_hops) { + rmm::device_uvector merged_vertices( + unique_label_major_pair_vertices.size() + unique_label_minor_pair_vertices.size(), + handle.get_stream()); + rmm::device_uvector merged_hops(merged_vertices.size(), handle.get_stream()); + rmm::device_uvector merged_flags(merged_vertices.size(), handle.get_stream()); + auto major_triplet_first = + thrust::make_zip_iterator(unique_label_major_pair_vertices.begin(), + (*unique_label_major_pair_hops).begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_triplet_first = + thrust::make_zip_iterator(unique_label_minor_pair_vertices.begin(), + (*unique_label_minor_pair_hops).begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge(handle.get_thrust_policy(), + major_triplet_first, + major_triplet_first + unique_label_major_pair_vertices.size(), + minor_triplet_first, + minor_triplet_first + unique_label_minor_pair_vertices.size(), + thrust::make_zip_iterator( + merged_vertices.begin(), merged_hops.begin(), merged_flags.begin())); + + unique_label_major_pair_vertices.resize(0, handle.get_stream()); + unique_label_major_pair_vertices.shrink_to_fit(handle.get_stream()); + unique_label_major_pair_hops = std::nullopt; + unique_label_minor_pair_vertices.resize(0, handle.get_stream()); + unique_label_minor_pair_vertices.shrink_to_fit(handle.get_stream()); + unique_label_minor_pair_hops = std::nullopt; + + merged_vertices.resize( + thrust::distance(merged_vertices.begin(), + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end(), + thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), + handle.get_stream()); + merged_hops.resize(merged_vertices.size(), handle.get_stream()); + merged_flags.resize(merged_vertices.size(), handle.get_stream()); + + auto sort_key_first = thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); + 
thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + merged_vertices.begin()); + + return std::make_tuple(std::move(merged_vertices), std::nullopt); + } else { + rmm::device_uvector output_vertices(unique_label_minor_pair_vertices.size(), + handle.get_stream()); + auto output_last = thrust::set_difference(handle.get_thrust_policy(), + unique_label_minor_pair_vertices.begin(), + unique_label_minor_pair_vertices.end(), + unique_label_major_pair_vertices.begin(), + unique_label_major_pair_vertices.end(), + output_vertices.begin()); + + auto num_unique_majors = unique_label_major_pair_vertices.size(); + auto renumber_map = std::move(unique_label_major_pair_vertices); + renumber_map.resize( + renumber_map.size() + thrust::distance(output_vertices.begin(), output_last), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + output_vertices.begin(), + output_last, + renumber_map.begin() + num_unique_majors); + + return std::make_tuple(std::move(renumber_map), std::nullopt); + } + } +} + +// this function does not reorder edges (the i'th returned edge is the renumbered output of the i'th +// input edge) +template +std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +renumber_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_majors, + rmm::device_uvector&& edgelist_minors, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool do_expensive_check) +{ + // 1. compute renumber_map + + auto [renumber_map, renumber_map_label_indices] = compute_renumber_map( + handle, + raft::device_span(edgelist_majors.data(), edgelist_majors.size()), + raft::device_span(edgelist_minors.data(), edgelist_minors.size()), + edgelist_hops ? std::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : std::nullopt, + edgelist_label_offsets + ? 
std::make_optional>(std::get<0>(*edgelist_label_offsets)) + : std::nullopt); + + // 2. compute renumber map offsets for each label + + std::optional> renumber_map_label_offsets{}; + if (edgelist_label_offsets) { + auto num_unique_labels = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator((*renumber_map_label_indices).size()), + detail::is_first_in_run_t{(*renumber_map_label_indices).data()}); + rmm::device_uvector unique_label_indices(num_unique_labels, handle.get_stream()); + rmm::device_uvector vertex_counts(num_unique_labels, handle.get_stream()); + thrust::reduce_by_key(handle.get_thrust_policy(), + (*renumber_map_label_indices).begin(), + (*renumber_map_label_indices).end(), + thrust::make_constant_iterator(size_t{1}), + unique_label_indices.begin(), + vertex_counts.begin()); + + renumber_map_label_offsets = + rmm::device_uvector(std::get<1>(*edgelist_label_offsets) + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*renumber_map_label_offsets).begin(), + (*renumber_map_label_offsets).end(), + size_t{0}); + thrust::scatter(handle.get_thrust_policy(), + vertex_counts.begin(), + vertex_counts.end(), + unique_label_indices.begin(), + (*renumber_map_label_offsets).begin() + 1); + + thrust::inclusive_scan(handle.get_thrust_policy(), + (*renumber_map_label_offsets).begin(), + (*renumber_map_label_offsets).end(), + (*renumber_map_label_offsets).begin()); + } + + // 3. 
renumber input edges + + if (edgelist_label_offsets) { + rmm::device_uvector new_vertices(renumber_map.size(), handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + new_vertices.begin(), + new_vertices.end(), + [label_indices = raft::device_span( + (*renumber_map_label_indices).data(), (*renumber_map_label_indices).size()), + renumber_map_label_offsets = raft::device_span( + (*renumber_map_label_offsets).data(), + (*renumber_map_label_offsets).size())] __device__(size_t i) { + auto label_index = label_indices[i]; + auto label_start_offset = renumber_map_label_offsets[label_index]; + return static_cast(i - label_start_offset); + }); + + (*renumber_map_label_indices).resize(0, handle.get_stream()); + (*renumber_map_label_indices).shrink_to_fit(handle.get_stream()); + + auto num_labels = std::get<0>(*edgelist_label_offsets).size(); + + rmm::device_uvector segment_sorted_renumber_map(renumber_map.size(), + handle.get_stream()); + rmm::device_uvector segment_sorted_new_vertices(new_vertices.size(), + handle.get_stream()); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + + auto approx_edges_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for segmented sort + + auto [h_label_offsets, h_edge_offsets] = detail::compute_offset_aligned_edge_chunks( + handle, + (*renumber_map_label_offsets).data(), + static_cast((*renumber_map_label_offsets).size() - 1), + renumber_map.size(), + approx_edges_to_sort_per_iteration); + auto num_chunks = h_label_offsets.size() - 1; + + for (size_t i = 0; i < num_chunks; ++i) { + size_t tmp_storage_bytes{0}; + + auto offset_first = + thrust::make_transform_iterator((*renumber_map_label_offsets).data() + h_label_offsets[i], + detail::shift_left_t{h_edge_offsets[i]}); + cub::DeviceSegmentedSort::SortPairs(static_cast(nullptr), + tmp_storage_bytes, + renumber_map.begin() + h_edge_offsets[i], + segment_sorted_renumber_map.begin() + 
h_edge_offsets[i], + new_vertices.begin() + h_edge_offsets[i], + segment_sorted_new_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + + cub::DeviceSegmentedSort::SortPairs(d_tmp_storage.data(), + tmp_storage_bytes, + renumber_map.begin() + h_edge_offsets[i], + segment_sorted_renumber_map.begin() + h_edge_offsets[i], + new_vertices.begin() + h_edge_offsets[i], + segment_sorted_new_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + } + new_vertices.resize(0, handle.get_stream()); + d_tmp_storage.resize(0, handle.get_stream()); + new_vertices.shrink_to_fit(handle.get_stream()); + d_tmp_storage.shrink_to_fit(handle.get_stream()); + + auto edgelist_label_indices = detail::expand_sparse_offsets( + std::get<0>(*edgelist_label_offsets), label_index_t{0}, handle.get_stream()); + + auto pair_first = + thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_label_indices.begin()); + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + edgelist_majors.size(), + edgelist_majors.begin(), + [renumber_map_label_offsets = raft::device_span( + (*renumber_map_label_offsets).data(), (*renumber_map_label_offsets).size()), + old_vertices = raft::device_span(segment_sorted_renumber_map.data(), + segment_sorted_renumber_map.size()), + new_vertices = raft::device_span( + segment_sorted_new_vertices.data(), + segment_sorted_new_vertices.size())] __device__(auto pair) { + auto old_vertex = thrust::get<0>(pair); + auto label_index = thrust::get<1>(pair); + auto label_start_offset = renumber_map_label_offsets[label_index]; + auto label_end_offset = 
renumber_map_label_offsets[label_index + 1]; + auto it = thrust::lower_bound(thrust::seq, + old_vertices.begin() + label_start_offset, + old_vertices.begin() + label_end_offset, + old_vertex); + assert(*it == old_vertex); + return *(new_vertices.begin() + thrust::distance(old_vertices.begin(), it)); + }); + + pair_first = thrust::make_zip_iterator(edgelist_minors.begin(), edgelist_label_indices.begin()); + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + edgelist_minors.size(), + edgelist_minors.begin(), + [renumber_map_label_offsets = raft::device_span( + (*renumber_map_label_offsets).data(), (*renumber_map_label_offsets).size()), + old_vertices = raft::device_span(segment_sorted_renumber_map.data(), + segment_sorted_renumber_map.size()), + new_vertices = raft::device_span( + segment_sorted_new_vertices.data(), + segment_sorted_new_vertices.size())] __device__(auto pair) { + auto old_vertex = thrust::get<0>(pair); + auto label_index = thrust::get<1>(pair); + auto label_start_offset = renumber_map_label_offsets[label_index]; + auto label_end_offset = renumber_map_label_offsets[label_index + 1]; + auto it = thrust::lower_bound(thrust::seq, + old_vertices.begin() + label_start_offset, + old_vertices.begin() + label_end_offset, + old_vertex); + assert(*it == old_vertex); + return new_vertices[thrust::distance(old_vertices.begin(), it)]; + }); + } else { + kv_store_t kv_store(renumber_map.begin(), + renumber_map.end(), + thrust::make_counting_iterator(vertex_t{0}), + std::numeric_limits::max(), + std::numeric_limits::max(), + handle.get_stream()); + auto kv_store_view = kv_store.view(); + + kv_store_view.find( + edgelist_majors.begin(), edgelist_majors.end(), edgelist_majors.begin(), handle.get_stream()); + kv_store_view.find( + edgelist_minors.begin(), edgelist_minors.end(), edgelist_minors.begin(), handle.get_stream()); + } + + return std::make_tuple(std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(renumber_map), + 
std::move(renumber_map_label_offsets)); +} + +template +void permute_array(raft::handle_t const& handle, + IndexIterator index_first, + IndexIterator index_last, + ValueIterator value_first /* [INOUT] */) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto tmp_buffer = allocate_dataframe_buffer(thrust::distance(index_first, index_last), + handle.get_stream()); + thrust::gather(handle.get_thrust_policy(), + index_first, + index_last, + value_first, + get_dataframe_buffer_begin(tmp_buffer)); + thrust::copy(handle.get_thrust_policy(), + get_dataframe_buffer_begin(tmp_buffer), + get_dataframe_buffer_end(tmp_buffer), + value_first); +} + +// key: ((label), (hop), major, minor) +template +std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional, size_t>>> +sort_sampled_edge_tuples( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_majors, + rmm::device_uvector&& edgelist_minors, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets) +{ + std::vector h_label_offsets{}; + std::vector h_edge_offsets{}; + + if (edgelist_label_offsets) { + auto approx_edges_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for sorts in chunks + + std::tie(h_label_offsets, h_edge_offsets) = + detail::compute_offset_aligned_edge_chunks(handle, + std::get<0>(*edgelist_label_offsets).data(), + std::get<1>(*edgelist_label_offsets), + edgelist_majors.size(), + approx_edges_to_sort_per_iteration); + } else { + h_label_offsets = {0, 1}; + h_edge_offsets = {0, edgelist_majors.size()}; + } + + auto num_chunks = h_label_offsets.size() - 1; + for (size_t i = 0; i < num_chunks; ++i) { + rmm::device_uvector indices(h_edge_offsets[i + 1] - h_edge_offsets[i], + handle.get_stream()); + 
thrust::sequence(handle.get_thrust_policy(), indices.begin(), indices.end(), size_t{0}); + edge_order_t edge_order_comp{ + edgelist_label_offsets ? thrust::make_optional>( + std::get<0>(*edgelist_label_offsets).data() + h_label_offsets[i], + (h_label_offsets[i + 1] - h_label_offsets[i]) + 1) + : thrust::nullopt, + edgelist_hops ? thrust::make_optional>( + std::get<0>(*edgelist_hops).data() + h_edge_offsets[i], indices.size()) + : thrust::nullopt, + raft::device_span(edgelist_majors.data() + h_edge_offsets[i], indices.size()), + raft::device_span(edgelist_minors.data() + h_edge_offsets[i], + indices.size())}; + thrust::sort(handle.get_thrust_policy(), indices.begin(), indices.end(), edge_order_comp); + + permute_array(handle, + indices.begin(), + indices.end(), + thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()) + + h_edge_offsets[i]); + + if (edgelist_weights) { + permute_array( + handle, indices.begin(), indices.end(), (*edgelist_weights).begin() + h_edge_offsets[i]); + } + + if (edgelist_edge_ids) { + permute_array( + handle, indices.begin(), indices.end(), (*edgelist_edge_ids).begin() + h_edge_offsets[i]); + } + + if (edgelist_edge_types) { + permute_array( + handle, indices.begin(), indices.end(), (*edgelist_edge_types).begin() + h_edge_offsets[i]); + } + + if (edgelist_hops) { + permute_array(handle, + indices.begin(), + indices.end(), + std::get<0>(*edgelist_hops).begin() + h_edge_offsets[i]); + } + } + + return std::make_tuple(std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops)); +} + +} // namespace + +template +std::tuple>, // dcsr/dcsc major vertices + rmm::device_uvector, // (d)csr/(d)csc offset values + rmm::device_uvector, // minor vertices + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>, // (label, hop) offsets to the (d)csr/(d)csc + // 
offset array + rmm::device_uvector, // renumber map + std::optional>> // label offsets to the renumber map +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check) +{ + using label_index_t = uint32_t; + + auto num_labels = edgelist_label_offsets ? std::get<1>(*edgelist_label_offsets) : size_t{1}; + auto num_hops = edgelist_hops ? std::get<1>(*edgelist_hops) : size_t{1}; + + // 1. check input arguments + + check_input_edges(handle, + edgelist_srcs, + edgelist_dsts, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + edgelist_label_offsets, + do_expensive_check); + + CUGRAPH_EXPECTS( + !doubly_compress || !compress_per_hop, + "Invalid input arguments: compress_per_hop should be false if doubly_compress is true."); + CUGRAPH_EXPECTS(!compress_per_hop || edgelist_hops, + "Invalid input arguments: edgelist_hops.has_value() should be true if " + "compress_per_hop is true."); + + // 2. renumber + + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); + + rmm::device_uvector renumber_map(0, handle.get_stream()); + std::optional> renumber_map_label_offsets{std::nullopt}; + std::tie(edgelist_majors, edgelist_minors, renumber_map, renumber_map_label_offsets) = + renumber_sampled_edgelist( + handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + edgelist_hops ? 
std::make_optional(std::make_tuple( + raft::device_span(std::get<0>(*edgelist_hops).data(), + std::get<0>(*edgelist_hops).size()), + num_hops)) + : std::nullopt, + edgelist_label_offsets, + do_expensive_check); + + // 3. sort by ((l), (h), major, minor) + + std::tie(edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops) = sort_sampled_edge_tuples(handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops), + edgelist_label_offsets); + + if (do_expensive_check) { + if (!compress_per_hop && edgelist_hops) { + rmm::device_uvector min_vertices(num_labels * num_hops, handle.get_stream()); + rmm::device_uvector max_vertices(min_vertices.size(), handle.get_stream()); + + auto label_index_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + optionally_compute_label_index_t{ + edgelist_label_offsets ? 
thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt}); + auto input_key_first = + thrust::make_zip_iterator(label_index_first, std::get<0>(*edgelist_hops).begin()); + rmm::device_uvector unique_key_label_indices(min_vertices.size(), + handle.get_stream()); + rmm::device_uvector unique_key_hops(min_vertices.size(), handle.get_stream()); + auto output_key_first = + thrust::make_zip_iterator(unique_key_label_indices.begin(), unique_key_hops.begin()); + + auto output_it = + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + edgelist_majors.begin(), + output_key_first, + min_vertices.begin(), + thrust::equal_to>{}, + thrust::minimum{}); + auto num_unique_keys = + static_cast(thrust::distance(output_key_first, thrust::get<0>(output_it))); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + edgelist_majors.begin(), + output_key_first, + max_vertices.begin(), + thrust::equal_to>{}, + thrust::maximum{}); + if (num_unique_keys > 1) { + auto num_invalids = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{1}), + thrust::make_counting_iterator(num_unique_keys), + [output_key_first, + min_vertices = raft::device_span(min_vertices.data(), num_unique_keys), + max_vertices = raft::device_span(max_vertices.data(), + num_unique_keys)] __device__(size_t i) { + auto prev_key = *(output_key_first + (i - 1)); + auto this_key = *(output_key_first + i); + if (thrust::get<0>(prev_key) == thrust::get<0>(this_key)) { + auto this_min = min_vertices[i]; + auto prev_max = max_vertices[i - 1]; + return prev_max >= this_min; + } else { + return false; + } + }); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input arguments: if @p compress_per_hop is false and @p " + "edgelist_hops.has_value() is true, the minimum majors with hop N + 1 " + "should be larger than the maximum majors with hop N after 
renumbering."); + } + } + } + + // 4. compute offsets for ((l), (h), major) triplets with non zero neighbors (update + // compressed_label_indices, compressed_hops, compressed_nzd_vertices, and compressed_offsets) + + auto num_uniques = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(edgelist_majors.size()), + is_first_in_run_t{ + edgelist_label_offsets ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops ? thrust::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : thrust::nullopt, + raft::device_span( + edgelist_majors.data(), + edgelist_majors.size())}); // number of unique ((label), (hop), major) triplets + + auto compressed_label_indices = + edgelist_label_offsets + ? std::make_optional>(num_uniques, handle.get_stream()) + : std::nullopt; + auto compressed_hops = edgelist_hops ? std::make_optional>( + num_uniques, handle.get_stream()) + : std::nullopt; + rmm::device_uvector compressed_nzd_vertices(num_uniques, handle.get_stream()); + rmm::device_uvector compressed_offsets(num_uniques + 1, handle.get_stream()); + compressed_offsets.set_element_to_zero_async(num_uniques, handle.get_stream()); + + if (edgelist_label_offsets) { + auto label_index_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + compute_label_index_t{std::get<0>(*edgelist_label_offsets)}); + + if (edgelist_hops) { + auto input_key_first = thrust::make_zip_iterator( + label_index_first, std::get<0>(*edgelist_hops).begin(), edgelist_majors.begin()); + auto output_key_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + (*compressed_hops).begin(), + compressed_nzd_vertices.begin()); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + thrust::make_constant_iterator(size_t{1}), + output_key_first, + 
compressed_offsets.begin()); + } else { + auto input_key_first = thrust::make_zip_iterator(label_index_first, edgelist_majors.begin()); + auto output_key_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + compressed_nzd_vertices.begin()); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + thrust::make_constant_iterator(size_t{1}), + output_key_first, + compressed_offsets.begin()); + } + } else { + if (edgelist_hops) { + auto input_key_first = + thrust::make_zip_iterator(std::get<0>(*edgelist_hops).begin(), edgelist_majors.begin()); + auto output_key_first = + thrust::make_zip_iterator((*compressed_hops).begin(), compressed_nzd_vertices.begin()); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + thrust::make_constant_iterator(size_t{1}), + output_key_first, + compressed_offsets.begin()); + } else { + auto input_key_first = edgelist_majors.begin(); + auto output_key_first = compressed_nzd_vertices.begin(); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + thrust::make_constant_iterator(size_t{1}), + output_key_first, + compressed_offsets.begin()); + } + } + thrust::exclusive_scan(handle.get_thrust_policy(), + compressed_offsets.begin(), + compressed_offsets.end(), + compressed_offsets.begin()); + + // 5. 
update compressed_offsets to include zero degree vertices (if doubly_compress is false) and + // compressed_offset_label_hop_offsets (if edgelist_label_offsets.has_value() or + // edgelist_hops.has_value() is true) + + std::optional> compressed_offset_label_hop_offsets{std::nullopt}; + if (doubly_compress) { + if (edgelist_label_offsets || edgelist_hops) { + rmm::device_uvector offset_array_offsets(num_labels * num_hops + 1, + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + offset_array_offsets.begin(), + offset_array_offsets.end(), + size_t{0}); + + if (edgelist_label_offsets) { + if (edgelist_hops) { + auto pair_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + (*compressed_hops).begin()); + auto value_pair_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), [num_hops] __device__(size_t i) { + return thrust::make_tuple(static_cast(i / num_hops), + static_cast(i % num_hops)); + }); + thrust::upper_bound(handle.get_thrust_policy(), + pair_first, + pair_first + (*compressed_label_indices).size(), + value_pair_first, + value_pair_first + (num_labels * num_hops), + offset_array_offsets.begin() + 1); + } else { + thrust::upper_bound( + handle.get_thrust_policy(), + (*compressed_label_indices).begin(), + (*compressed_label_indices).end(), + thrust::make_counting_iterator(label_index_t{0}), + thrust::make_counting_iterator(static_cast(num_labels)), + offset_array_offsets.begin() + 1); + } + } else { + thrust::upper_bound(handle.get_thrust_policy(), + (*compressed_hops).begin(), + (*compressed_hops).end(), + thrust::make_counting_iterator(int32_t{0}), + thrust::make_counting_iterator(static_cast(num_hops)), + offset_array_offsets.begin() + 1); + } + + compressed_offset_label_hop_offsets = std::move(offset_array_offsets); + } + } else { // !doubly_compress + rmm::device_uvector major_vertex_counts(num_labels * num_hops, handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + 
major_vertex_counts.begin(), + major_vertex_counts.end(), + [edgelist_label_offsets = edgelist_label_offsets + ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops = edgelist_hops + ? thrust::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : thrust::nullopt, + edgelist_majors = + raft::device_span(edgelist_majors.data(), edgelist_majors.size()), + num_hops, + compress_per_hop] __device__(size_t i) { + size_t start_offset{0}; + auto end_offset = edgelist_majors.size(); + auto label_start_offset = start_offset; + auto label_end_offset = end_offset; + + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / num_hops); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + label_start_offset = start_offset; + label_end_offset = end_offset; + } + + if (num_hops > 1) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } + if (compress_per_hop) { + return (start_offset < end_offset) ? 
(edgelist_majors[end_offset - 1] + 1) : vertex_t{0}; + } else { + if (end_offset != label_end_offset) { + return edgelist_majors[end_offset]; + } else if (label_start_offset < label_end_offset) { + return edgelist_majors[end_offset - 1] + 1; + } else { + return vertex_t{0}; + } + } + }); + + std::optional> minor_vertex_counts{std::nullopt}; + if (compress_per_hop) { + minor_vertex_counts = + rmm::device_uvector(major_vertex_counts.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*minor_vertex_counts).begin(), + (*minor_vertex_counts).end(), + vertex_t{0}); + if (edgelist_label_offsets) { + auto triplet_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + (*compressed_hops).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each(handle.get_thrust_policy(), + triplet_first, + triplet_first + compressed_nzd_vertices.size(), + [edgelist_minors = raft::device_span( + edgelist_minors.data(), edgelist_minors.size()), + compressed_offsets = raft::device_span( + compressed_offsets.data(), compressed_offsets.size()), + minor_vertex_counts = raft::device_span( + (*minor_vertex_counts).data(), (*minor_vertex_counts).size()), + num_hops] __device__(auto triplet) { + auto nzd_v_idx = thrust::get<2>(triplet); + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto l_idx = thrust::get<0>(triplet); + auto h = thrust::get<1>(triplet); + cuda::atomic_ref minor_vertex_count( + minor_vertex_counts[l_idx * num_hops + h]); + minor_vertex_count.fetch_max(edgelist_minors[end_offset - 1] + 1, + cuda::std::memory_order_relaxed); + }); + } else { + auto pair_first = thrust::make_zip_iterator((*compressed_hops).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each(handle.get_thrust_policy(), + pair_first, + pair_first + compressed_nzd_vertices.size(), + [edgelist_minors = raft::device_span( + edgelist_minors.data(), edgelist_minors.size()), + compressed_offsets = raft::device_span( + 
compressed_offsets.data(), compressed_offsets.size()), + minor_vertex_counts = raft::device_span( + (*minor_vertex_counts).data(), (*minor_vertex_counts).size()), + num_hops] __device__(auto pair) { + auto nzd_v_idx = thrust::get<1>(pair); + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto h = thrust::get<0>(pair); + cuda::atomic_ref minor_vertex_count( + minor_vertex_counts[h]); + minor_vertex_count.fetch_max(edgelist_minors[end_offset - 1] + 1, + cuda::std::memory_order_relaxed); + }); + } + } + + rmm::device_uvector offset_array_offsets(num_labels * num_hops + 1, + handle.get_stream()); + offset_array_offsets.set_element_to_zero_async(num_labels * num_hops, handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + offset_array_offsets.begin(), + offset_array_offsets.begin() + (num_labels * num_hops), + [major_vertex_counts = + raft::device_span(major_vertex_counts.data(), major_vertex_counts.size()), + minor_vertex_counts = minor_vertex_counts + ? thrust::make_optional>( + (*minor_vertex_counts).data(), (*minor_vertex_counts).size()) + : thrust::nullopt, + num_hops, + compress_per_hop] __device__(size_t i) { + auto vertex_count = major_vertex_counts[i]; + if (num_hops > 1) { + if (compress_per_hop) { + for (size_t j = (i - (i % num_hops)); j < i; ++j) { + vertex_count = cuda::std::max(vertex_count, major_vertex_counts[j]); + vertex_count = cuda::std::max(vertex_count, (*minor_vertex_counts)[j]); + } + } else { + if (i % num_hops != 0) { vertex_count -= major_vertex_counts[i - 1]; } + } + } + return vertex_count; + }); + thrust::exclusive_scan(handle.get_thrust_policy(), + offset_array_offsets.begin(), + offset_array_offsets.end(), + offset_array_offsets.begin()); + + auto tmp_compressed_offsets = rmm::device_uvector( + offset_array_offsets.back_element(handle.get_stream()) + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + tmp_compressed_offsets.begin(), + tmp_compressed_offsets.end(), + size_t{0}); + + if 
(edgelist_label_offsets) { + if (edgelist_hops) { + auto triplet_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + (*compressed_hops).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each( + handle.get_thrust_policy(), + triplet_first, + triplet_first + compressed_nzd_vertices.size(), + [compressed_nzd_vertices = raft::device_span( + compressed_nzd_vertices.data(), compressed_nzd_vertices.size()), + offset_array_offsets = raft::device_span(offset_array_offsets.data(), + offset_array_offsets.size()), + compressed_offsets = + raft::device_span(compressed_offsets.data(), compressed_offsets.size()), + tmp_compressed_offsets = raft::device_span(tmp_compressed_offsets.data(), + tmp_compressed_offsets.size()), + compress_per_hop, + num_hops] __device__(auto triplet) { + auto nzd_v_idx = thrust::get<2>(triplet); + size_t start_offset = compressed_offsets[nzd_v_idx]; + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto l_idx = thrust::get<0>(triplet); + auto h = thrust::get<1>(triplet); + tmp_compressed_offsets[offset_array_offsets[l_idx * num_hops + + (compress_per_hop ? 
h : int32_t{0})] + + compressed_nzd_vertices[nzd_v_idx]] = end_offset - start_offset; + }); + } else { + auto pair_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each( + handle.get_thrust_policy(), + pair_first, + pair_first + compressed_nzd_vertices.size(), + [compressed_nzd_vertices = raft::device_span( + compressed_nzd_vertices.data(), compressed_nzd_vertices.size()), + offset_array_offsets = raft::device_span(offset_array_offsets.data(), + offset_array_offsets.size()), + compressed_offsets = + raft::device_span(compressed_offsets.data(), compressed_offsets.size()), + tmp_compressed_offsets = raft::device_span( + tmp_compressed_offsets.data(), tmp_compressed_offsets.size())] __device__(auto pair) { + auto nzd_v_idx = thrust::get<1>(pair); + size_t start_offset = compressed_offsets[nzd_v_idx]; + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto l_idx = thrust::get<0>(pair); + tmp_compressed_offsets[offset_array_offsets[l_idx] + + compressed_nzd_vertices[nzd_v_idx]] = end_offset - start_offset; + }); + } + } else { + if (edgelist_hops) { + auto pair_first = thrust::make_zip_iterator((*compressed_hops).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each( + handle.get_thrust_policy(), + pair_first, + pair_first + compressed_nzd_vertices.size(), + [compressed_nzd_vertices = raft::device_span( + compressed_nzd_vertices.data(), compressed_nzd_vertices.size()), + offset_array_offsets = raft::device_span(offset_array_offsets.data(), + offset_array_offsets.size()), + compressed_offsets = + raft::device_span(compressed_offsets.data(), compressed_offsets.size()), + tmp_compressed_offsets = raft::device_span(tmp_compressed_offsets.data(), + tmp_compressed_offsets.size()), + compress_per_hop] __device__(auto pair) { + auto nzd_v_idx = thrust::get<1>(pair); + size_t start_offset = compressed_offsets[nzd_v_idx]; + size_t end_offset = compressed_offsets[nzd_v_idx 
+ 1]; + auto h = thrust::get<0>(pair); + tmp_compressed_offsets[offset_array_offsets[compress_per_hop ? h : int32_t{0}] + + compressed_nzd_vertices[nzd_v_idx]] = end_offset - start_offset; + }); + } else { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(compressed_nzd_vertices.size()), + [compressed_nzd_vertices = raft::device_span( + compressed_nzd_vertices.data(), compressed_nzd_vertices.size()), + compressed_offsets = + raft::device_span(compressed_offsets.data(), compressed_offsets.size()), + tmp_compressed_offsets = + raft::device_span(tmp_compressed_offsets.data(), + tmp_compressed_offsets.size())] __device__(auto nzd_v_idx) { + size_t start_offset = compressed_offsets[nzd_v_idx]; + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + tmp_compressed_offsets[compressed_nzd_vertices[nzd_v_idx]] = end_offset - start_offset; + }); + } + } + + thrust::exclusive_scan(handle.get_thrust_policy(), + tmp_compressed_offsets.begin(), + tmp_compressed_offsets.end(), + tmp_compressed_offsets.begin()); + + compressed_offsets = std::move(tmp_compressed_offsets); + + if (edgelist_label_offsets || edgelist_hops) { + compressed_offset_label_hop_offsets = std::move(offset_array_offsets); + } + } + + edgelist_hops = std::nullopt; + + return std::make_tuple( + doubly_compress ? 
std::make_optional(std::move(compressed_nzd_vertices)) : std::nullopt, + std::move(compressed_offsets), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(compressed_offset_label_hop_offsets), + std::move(renumber_map), + std::move(renumber_map_label_offsets)); +} + +template +std::tuple, // srcs + rmm::device_uvector, // dsts + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>, // (label, hop) offsets to the edges + rmm::device_uvector, // renumber map + std::optional>> // label offsets to the renumber map +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check) +{ + using label_index_t = uint32_t; + + auto num_labels = edgelist_label_offsets ? std::get<1>(*edgelist_label_offsets) : size_t{1}; + auto num_hops = edgelist_hops ? std::get<1>(*edgelist_hops) : size_t{1}; + + // 1. check input arguments + + check_input_edges(handle, + edgelist_srcs, + edgelist_dsts, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + edgelist_label_offsets, + do_expensive_check); + + // 2. renumber + + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? 
std::move(edgelist_dsts) : std::move(edgelist_srcs); + + rmm::device_uvector renumber_map(0, handle.get_stream()); + std::optional> renumber_map_label_offsets{std::nullopt}; + std::tie(edgelist_majors, edgelist_minors, renumber_map, renumber_map_label_offsets) = + renumber_sampled_edgelist( + handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + edgelist_hops ? std::make_optional(std::make_tuple( + raft::device_span(std::get<0>(*edgelist_hops).data(), + std::get<0>(*edgelist_hops).size()), + num_hops)) + : std::nullopt, + edgelist_label_offsets, + do_expensive_check); + + // 3. sort by ((l), (h), major, minor) + + std::tie(edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops) = sort_sampled_edge_tuples(handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops), + edgelist_label_offsets); + + // 4. compute edgelist_label_hop_offsets + + std::optional> edgelist_label_hop_offsets{std::nullopt}; + if (edgelist_label_offsets || edgelist_hops) { + edgelist_label_hop_offsets = + rmm::device_uvector(num_labels * num_hops + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*edgelist_label_hop_offsets).begin(), + (*edgelist_label_hop_offsets).end(), + size_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_labels * num_hops), + [edgelist_label_offsets = edgelist_label_offsets + ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops = edgelist_hops + ? 
thrust::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : thrust::nullopt, + num_hops, + num_edges = edgelist_majors.size()] __device__(size_t i) { + size_t start_offset{0}; + auto end_offset = num_edges; + + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / num_hops); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + } + + if (edgelist_hops) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } + + return end_offset - start_offset; + }); + thrust::exclusive_scan(handle.get_thrust_policy(), + (*edgelist_label_hop_offsets).begin(), + (*edgelist_label_hop_offsets).end(), + (*edgelist_label_hop_offsets).begin()); + } + + edgelist_hops = std::nullopt; + + return std::make_tuple(std::move(src_is_major ? edgelist_majors : edgelist_minors), + std::move(src_is_major ? 
edgelist_minors : edgelist_majors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_label_hop_offsets), + std::move(renumber_map), + std::move(renumber_map_label_offsets)); +} + +template +std::tuple, // srcs + rmm::device_uvector, // dsts + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>> // (label, hop) offsets to the edges +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check) +{ + using label_index_t = uint32_t; + + auto num_labels = edgelist_label_offsets ? std::get<1>(*edgelist_label_offsets) : size_t{1}; + auto num_hops = edgelist_hops ? std::get<1>(*edgelist_hops) : size_t{1}; + + // 1. check input arguments + + check_input_edges(handle, + edgelist_srcs, + edgelist_dsts, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + edgelist_label_offsets, + do_expensive_check); + + // 2. sort by ((l), (h), major, minor) + + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); + + std::tie(edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops) = sort_sampled_edge_tuples(handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops), + edgelist_label_offsets); + + // 3. 
compute edgelist_label_hop_offsets + + std::optional> edgelist_label_hop_offsets{std::nullopt}; + if (edgelist_label_offsets || edgelist_hops) { + edgelist_label_hop_offsets = + rmm::device_uvector(num_labels * num_hops + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*edgelist_label_hop_offsets).begin(), + (*edgelist_label_hop_offsets).end(), + size_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_labels * num_hops), + [edgelist_label_offsets = edgelist_label_offsets + ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops = edgelist_hops + ? thrust::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : thrust::nullopt, + num_hops, + num_edges = edgelist_majors.size()] __device__(size_t i) { + size_t start_offset{0}; + auto end_offset = num_edges; + + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / num_hops); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + } + + if (edgelist_hops) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } + + return end_offset - start_offset; + }); + thrust::exclusive_scan(handle.get_thrust_policy(), + (*edgelist_label_hop_offsets).begin(), + (*edgelist_label_hop_offsets).end(), + (*edgelist_label_hop_offsets).begin()); + } + + edgelist_hops = std::nullopt; + + return std::make_tuple(std::move(src_is_major ? 
edgelist_majors : edgelist_minors), + std::move(src_is_major ? edgelist_minors : edgelist_majors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_label_hop_offsets)); +} + +} // namespace cugraph diff --git a/cpp/src/sampling/sampling_post_processing_sg.cu b/cpp/src/sampling/sampling_post_processing_sg.cu new file mode 100644 index 00000000000..75e3c5f005a --- /dev/null +++ b/cpp/src/sampling/sampling_post_processing_sg.cu @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "sampling_post_processing_impl.cuh" + +namespace cugraph { + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template 
std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + 
std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& 
edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& 
edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + 
raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eebd31a0030..5e1e1d6ace3 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -390,9 +390,9 @@ ConfigureTest(UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/sg_uniform_neighbor_sampli target_link_libraries(UNIFORM_NEIGHBOR_SAMPLING_TEST PRIVATE cuco::cuco) ################################################################################################### -# - RENUMBER SAMPLED EDGE LIST tests -------------------------------------------------------------- -ConfigureTest(RENUMBER_SAMPLED_EDGELIST_TEST sampling/renumber_sampled_edgelist_test.cu) -target_link_libraries(RENUMBER_SAMPLED_EDGELIST_TEST PRIVATE cuco::cuco) +# - SAMPLING_POST_PROCESSING tests ---------------------------------------------------------------- +ConfigureTest(SAMPLING_POST_PROCESSING_TEST sampling/sampling_post_processing_test.cu) +target_link_libraries(SAMPLING_POST_PROCESSING_TEST PRIVATE cuco::cuco) ################################################################################################### # - Renumber tests -------------------------------------------------------------------------------- diff --git a/cpp/tests/sampling/renumber_sampled_edgelist_test.cu b/cpp/tests/sampling/renumber_sampled_edgelist_test.cu deleted file mode 100644 index 96c8d6173e7..00000000000 --- a/cpp/tests/sampling/renumber_sampled_edgelist_test.cu +++ /dev/null @@ -1,512 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include - -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -struct RenumberSampledEdgelist_Usecase { - size_t num_vertices{}; - size_t num_sampled_edges{}; - size_t num_hops{1}; // enabled if larger than 1 - size_t num_labels{1}; // enabled if larger than 1 - bool check_correctness{true}; -}; - -class Tests_RenumberSampledEdgelist - : public ::testing::TestWithParam { - public: - Tests_RenumberSampledEdgelist() {} - - static void SetUpTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - template - void run_current_test(RenumberSampledEdgelist_Usecase const& usecase) - { - using label_t = int32_t; - - raft::handle_t handle{}; - HighResTimer hr_timer{}; - - raft::random::RngState rng_state(0); - - rmm::device_uvector org_edgelist_srcs(usecase.num_sampled_edges, handle.get_stream()); - rmm::device_uvector org_edgelist_dsts(usecase.num_sampled_edges, handle.get_stream()); - cugraph::detail::uniform_random_fill(handle.get_stream(), - org_edgelist_srcs.data(), - org_edgelist_srcs.size(), - vertex_t{0}, - static_cast(usecase.num_vertices), - rng_state); - cugraph::detail::uniform_random_fill(handle.get_stream(), - org_edgelist_dsts.data(), - org_edgelist_dsts.size(), - vertex_t{0}, - static_cast(usecase.num_vertices), - rng_state); - - 
std::optional> edgelist_hops{std::nullopt}; - if (usecase.num_hops > 1) { - edgelist_hops = rmm::device_uvector(usecase.num_sampled_edges, handle.get_stream()); - cugraph::detail::uniform_random_fill(handle.get_stream(), - (*edgelist_hops).data(), - (*edgelist_hops).size(), - int32_t{0}, - static_cast(usecase.num_hops), - rng_state); - } - - std::optional, rmm::device_uvector>> - label_offsets{std::nullopt}; - if (usecase.num_labels > 1) { - rmm::device_uvector labels(usecase.num_labels, handle.get_stream()); - thrust::sequence(handle.get_thrust_policy(), labels.begin(), labels.end(), label_t{0}); - - rmm::device_uvector edgelist_labels(usecase.num_sampled_edges, handle.get_stream()); - cugraph::detail::uniform_random_fill(handle.get_stream(), - edgelist_labels.data(), - edgelist_labels.size(), - label_t{0}, - static_cast(usecase.num_labels), - rng_state); - - rmm::device_uvector offsets(usecase.num_labels + 1, handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), offsets.begin(), offsets.end(), size_t{0}); - - thrust::for_each( - handle.get_thrust_policy(), - edgelist_labels.begin(), - edgelist_labels.end(), - [offsets = - raft::device_span(offsets.data(), offsets.size())] __device__(label_t label) { - cuda::atomic_ref atomic_counter(offsets[label]); - atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); - }); - - thrust::exclusive_scan( - handle.get_thrust_policy(), offsets.begin(), offsets.end(), offsets.begin()); - - label_offsets = std::make_tuple(std::move(labels), std::move(offsets)); - } - - rmm::device_uvector renumbered_edgelist_srcs(org_edgelist_srcs.size(), - handle.get_stream()); - rmm::device_uvector renumbered_edgelist_dsts(org_edgelist_dsts.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - org_edgelist_srcs.begin(), - org_edgelist_srcs.end(), - renumbered_edgelist_srcs.begin()); - thrust::copy(handle.get_thrust_policy(), - org_edgelist_dsts.begin(), - org_edgelist_dsts.end(), - 
renumbered_edgelist_dsts.begin()); - - rmm::device_uvector renumber_map(0, handle.get_stream()); - std::optional> renumber_map_label_offsets{std::nullopt}; - - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - hr_timer.start("Renumber sampled edgelist"); - } - - std::tie(renumbered_edgelist_srcs, - renumbered_edgelist_dsts, - renumber_map, - renumber_map_label_offsets) = - cugraph::renumber_sampled_edgelist( - handle, - std::move(renumbered_edgelist_srcs), - std::move(renumbered_edgelist_dsts), - edgelist_hops ? std::make_optional>( - (*edgelist_hops).data(), (*edgelist_hops).size()) - : std::nullopt, - label_offsets - ? std::make_optional< - std::tuple, raft::device_span>>( - std::make_tuple(raft::device_span(std::get<0>(*label_offsets).data(), - std::get<0>(*label_offsets).size()), - raft::device_span(std::get<1>(*label_offsets).data(), - std::get<1>(*label_offsets).size()))) - : std::nullopt); - - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - hr_timer.stop(); - hr_timer.display_and_clear(std::cout); - } - - if (usecase.check_correctness) { - for (size_t i = 0; i < usecase.num_labels; ++i) { - size_t edgelist_start_offset = - label_offsets ? std::get<1>(*label_offsets).element(i, handle.get_stream()) : size_t{0}; - size_t edgelist_end_offset = - label_offsets ? std::get<1>(*label_offsets).element(i + 1, handle.get_stream()) - : usecase.num_sampled_edges; - if (edgelist_start_offset == edgelist_end_offset) continue; - - auto this_label_org_edgelist_srcs = - raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_org_edgelist_dsts = - raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_edgelist_hops = edgelist_hops - ? 
std::make_optional>( - (*edgelist_hops).data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset) - : std::nullopt; - auto this_label_renumbered_edgelist_srcs = - raft::device_span(renumbered_edgelist_srcs.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_renumbered_edgelist_dsts = - raft::device_span(renumbered_edgelist_dsts.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - - size_t renumber_map_start_offset = - renumber_map_label_offsets ? (*renumber_map_label_offsets).element(i, handle.get_stream()) - : size_t{0}; - size_t renumber_map_end_offset = - renumber_map_label_offsets - ? (*renumber_map_label_offsets).element(i + 1, handle.get_stream()) - : renumber_map.size(); - auto this_label_renumber_map = - raft::device_span(renumber_map.data() + renumber_map_start_offset, - renumber_map_end_offset - renumber_map_start_offset); - - // check un-renumbering recovers the original edge list - - auto pair_first = thrust::make_zip_iterator(this_label_org_edgelist_srcs.begin(), - this_label_renumbered_edgelist_srcs.begin()); - auto num_renumber_errors = - thrust::count_if(handle.get_thrust_policy(), - pair_first, - pair_first + this_label_org_edgelist_srcs.size(), - [this_label_renumber_map] __device__(auto pair) { - auto org = thrust::get<0>(pair); - auto renumbered = thrust::get<1>(pair); - return this_label_renumber_map[renumbered] != org; - }); - ASSERT_TRUE(num_renumber_errors == 0) << "Renumber error in edge list sources."; - - pair_first = thrust::make_zip_iterator(this_label_org_edgelist_dsts.begin(), - this_label_renumbered_edgelist_dsts.begin()); - num_renumber_errors = thrust::count_if(handle.get_thrust_policy(), - pair_first, - pair_first + this_label_org_edgelist_dsts.size(), - [this_label_renumber_map] __device__(auto pair) { - auto org = thrust::get<0>(pair); - auto renumbered = thrust::get<1>(pair); - return this_label_renumber_map[renumbered] != org; - }); 
- ASSERT_TRUE(num_renumber_errors == 0) << "Renumber error in edge list destinations."; - - // Check the invariants in renumber_map - // Say we found the minimum (primary key:hop, secondary key:flag) pairs for every unique - // vertices, where flag is 0 for sources and 1 for destinations. Then, vertices with smaller - // (hop, flag) pairs should be renumbered to smaller numbers than vertices with larger (hop, - // flag) pairs. - - rmm::device_uvector unique_srcs(this_label_org_edgelist_srcs.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - this_label_org_edgelist_srcs.begin(), - this_label_org_edgelist_srcs.end(), - unique_srcs.begin()); - std::optional> unique_src_hops = - this_label_edgelist_hops ? std::make_optional>( - (*this_label_edgelist_hops).size(), handle.get_stream()) - : std::nullopt; - if (this_label_edgelist_hops) { - thrust::copy(handle.get_thrust_policy(), - (*this_label_edgelist_hops).begin(), - (*this_label_edgelist_hops).end(), - (*unique_src_hops).begin()); - - auto pair_first = - thrust::make_zip_iterator(unique_srcs.begin(), (*unique_src_hops).begin()); - thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_srcs.size()); - unique_srcs.resize( - thrust::distance(unique_srcs.begin(), - thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), - unique_srcs.begin(), - unique_srcs.end(), - (*unique_src_hops).begin()))), - handle.get_stream()); - (*unique_src_hops).resize(unique_srcs.size(), handle.get_stream()); - } else { - thrust::sort(handle.get_thrust_policy(), unique_srcs.begin(), unique_srcs.end()); - unique_srcs.resize( - thrust::distance( - unique_srcs.begin(), - thrust::unique(handle.get_thrust_policy(), unique_srcs.begin(), unique_srcs.end())), - handle.get_stream()); - } - - rmm::device_uvector unique_dsts(this_label_org_edgelist_dsts.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - this_label_org_edgelist_dsts.begin(), - 
this_label_org_edgelist_dsts.end(), - unique_dsts.begin()); - std::optional> unique_dst_hops = - this_label_edgelist_hops ? std::make_optional>( - (*this_label_edgelist_hops).size(), handle.get_stream()) - : std::nullopt; - if (this_label_edgelist_hops) { - thrust::copy(handle.get_thrust_policy(), - (*this_label_edgelist_hops).begin(), - (*this_label_edgelist_hops).end(), - (*unique_dst_hops).begin()); - - auto pair_first = - thrust::make_zip_iterator(unique_dsts.begin(), (*unique_dst_hops).begin()); - thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_dsts.size()); - unique_dsts.resize( - thrust::distance(unique_dsts.begin(), - thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), - unique_dsts.begin(), - unique_dsts.end(), - (*unique_dst_hops).begin()))), - handle.get_stream()); - (*unique_dst_hops).resize(unique_dsts.size(), handle.get_stream()); - } else { - thrust::sort(handle.get_thrust_policy(), unique_dsts.begin(), unique_dsts.end()); - unique_dsts.resize( - thrust::distance( - unique_dsts.begin(), - thrust::unique(handle.get_thrust_policy(), unique_dsts.begin(), unique_dsts.end())), - handle.get_stream()); - } - - rmm::device_uvector sorted_org_vertices(this_label_renumber_map.size(), - handle.get_stream()); - rmm::device_uvector matching_renumbered_vertices(sorted_org_vertices.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - this_label_renumber_map.begin(), - this_label_renumber_map.end(), - sorted_org_vertices.begin()); - thrust::sequence(handle.get_thrust_policy(), - matching_renumbered_vertices.begin(), - matching_renumbered_vertices.end(), - vertex_t{0}); - thrust::sort_by_key(handle.get_thrust_policy(), - sorted_org_vertices.begin(), - sorted_org_vertices.end(), - matching_renumbered_vertices.begin()); - - if (this_label_edgelist_hops) { - rmm::device_uvector merged_vertices(unique_srcs.size() + unique_dsts.size(), - handle.get_stream()); - rmm::device_uvector 
merged_hops(merged_vertices.size(), handle.get_stream()); - rmm::device_uvector merged_flags(merged_vertices.size(), handle.get_stream()); - - auto src_triplet_first = - thrust::make_zip_iterator(unique_srcs.begin(), - (*unique_src_hops).begin(), - thrust::make_constant_iterator(int8_t{0})); - auto dst_triplet_first = - thrust::make_zip_iterator(unique_dsts.begin(), - (*unique_dst_hops).begin(), - thrust::make_constant_iterator(int8_t{1})); - thrust::merge(handle.get_thrust_policy(), - src_triplet_first, - src_triplet_first + unique_srcs.size(), - dst_triplet_first, - dst_triplet_first + unique_dsts.size(), - thrust::make_zip_iterator( - merged_vertices.begin(), merged_hops.begin(), merged_flags.begin())); - merged_vertices.resize( - thrust::distance( - merged_vertices.begin(), - thrust::get<0>(thrust::unique_by_key( - handle.get_thrust_policy(), - merged_vertices.begin(), - merged_vertices.end(), - thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), - handle.get_stream()); - merged_hops.resize(merged_vertices.size(), handle.get_stream()); - merged_flags.resize(merged_vertices.size(), handle.get_stream()); - - auto sort_key_first = - thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); - thrust::sort_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - merged_vertices.begin()); - - auto num_unique_keys = thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(merged_hops.size()), - cugraph::detail::is_first_in_run_t{sort_key_first}); - rmm::device_uvector min_vertices(num_unique_keys, handle.get_stream()); - rmm::device_uvector max_vertices(num_unique_keys, handle.get_stream()); - - auto renumbered_merged_vertex_first = thrust::make_transform_iterator( - merged_vertices.begin(), - [sorted_org_vertices = raft::device_span(sorted_org_vertices.data(), - sorted_org_vertices.size()), - matching_renumbered_vertices = 
raft::device_span( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t src) { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), src); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), - it)]; - }); - - thrust::reduce_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - renumbered_merged_vertex_first, - thrust::make_discard_iterator(), - min_vertices.begin(), - thrust::equal_to>{}, - thrust::minimum{}); - thrust::reduce_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - renumbered_merged_vertex_first, - thrust::make_discard_iterator(), - max_vertices.begin(), - thrust::equal_to>{}, - thrust::maximum{}); - - auto num_violations = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{1}), - thrust::make_counting_iterator(min_vertices.size()), - [min_vertices = raft::device_span(min_vertices.data(), - min_vertices.size()), - max_vertices = raft::device_span( - max_vertices.data(), max_vertices.size())] __device__(size_t i) { - return min_vertices[i] <= max_vertices[i - 1]; - }); - - ASSERT_TRUE(num_violations == 0) - << "Invariant violated, a vertex with a smaller (hop,flag) pair is renumbered to a " - "larger value than a vertex with a larger (hop, flag) pair."; - } else { - unique_dsts.resize( - thrust::distance( - unique_dsts.begin(), - thrust::remove_if(handle.get_thrust_policy(), - unique_dsts.begin(), - unique_dsts.end(), - [sorted_unique_srcs = raft::device_span( - unique_srcs.data(), unique_srcs.size())] __device__(auto dst) { - return thrust::binary_search(thrust::seq, - sorted_unique_srcs.begin(), - sorted_unique_srcs.end(), - dst); - })), - handle.get_stream()); - - auto max_src_renumbered_vertex = thrust::transform_reduce( - handle.get_thrust_policy(), - unique_srcs.begin(), - unique_srcs.end(), - 
[sorted_org_vertices = raft::device_span(sorted_org_vertices.data(), - sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t src) { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), src); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), - it)]; - }, - std::numeric_limits::lowest(), - thrust::maximum{}); - - auto min_dst_renumbered_vertex = thrust::transform_reduce( - handle.get_thrust_policy(), - unique_dsts.begin(), - unique_dsts.end(), - [sorted_org_vertices = raft::device_span(sorted_org_vertices.data(), - sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t dst) { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), dst); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), - it)]; - }, - std::numeric_limits::max(), - thrust::minimum{}); - - ASSERT_TRUE(max_src_renumbered_vertex < min_dst_renumbered_vertex) - << "Invariants violated, a source vertex is renumbered to a non-smaller value than a " - "vertex that appear only in the edge list destinations."; - } - } - } - } -}; - -TEST_P(Tests_RenumberSampledEdgelist, CheckInt32) -{ - auto param = GetParam(); - run_current_test(param); -} - -TEST_P(Tests_RenumberSampledEdgelist, CheckInt64) -{ - auto param = GetParam(); - run_current_test(param); -} - -INSTANTIATE_TEST_SUITE_P( - small_test, - Tests_RenumberSampledEdgelist, - ::testing::Values(RenumberSampledEdgelist_Usecase{1024, 4096, 1, 1, true}, - RenumberSampledEdgelist_Usecase{1024, 4096, 3, 1, true}, - RenumberSampledEdgelist_Usecase{1024, 32768, 1, 256, true}, - RenumberSampledEdgelist_Usecase{1024, 32768, 3, 256, true})); - 
-INSTANTIATE_TEST_SUITE_P( - benchmark_test, - Tests_RenumberSampledEdgelist, - ::testing::Values(RenumberSampledEdgelist_Usecase{1 << 20, 1 << 20, 1, 1, false}, - RenumberSampledEdgelist_Usecase{1 << 20, 1 << 20, 5, 1, false}, - RenumberSampledEdgelist_Usecase{1 << 20, 1 << 24, 1, 1 << 20, false}, - RenumberSampledEdgelist_Usecase{1 << 20, 1 << 24, 5, 1 << 20, false})); - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu new file mode 100644 index 00000000000..422fe953b20 --- /dev/null +++ b/cpp/tests/sampling/sampling_post_processing_test.cu @@ -0,0 +1,1457 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct SamplingPostProcessing_Usecase { + size_t num_labels{}; + size_t num_seeds_per_label{}; + std::vector fanouts{{-1}}; + bool sample_with_replacement{false}; + + bool src_is_major{true}; + bool compress_per_hop{false}; + bool doubly_compress{false}; + bool check_correctness{true}; +}; + +template +bool compare_edgelist(raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumber_map) +{ + if (org_edgelist_srcs.size() != renumbered_edgelist_srcs.size()) { return false; } + + rmm::device_uvector sorted_org_edgelist_srcs(org_edgelist_srcs.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_srcs.begin(), + org_edgelist_srcs.end(), + sorted_org_edgelist_srcs.begin()); + rmm::device_uvector sorted_org_edgelist_dsts(org_edgelist_dsts.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_dsts.begin(), + org_edgelist_dsts.end(), + sorted_org_edgelist_dsts.begin()); + auto sorted_org_edgelist_weights = org_edgelist_weights + ? 
std::make_optional>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + if (sorted_org_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_weights).begin(), + (*org_edgelist_weights).end(), + (*sorted_org_edgelist_weights).begin()); + } + + if (sorted_org_edgelist_weights) { + auto sorted_org_edge_first = thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), + sorted_org_edgelist_dsts.begin(), + (*sorted_org_edgelist_weights).begin()); + thrust::sort(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + sorted_org_edgelist_srcs.size()); + } else { + auto sorted_org_edge_first = + thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), sorted_org_edgelist_dsts.begin()); + thrust::sort(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + sorted_org_edgelist_srcs.size()); + } + + rmm::device_uvector sorted_unrenumbered_edgelist_srcs(renumbered_edgelist_srcs.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumbered_edgelist_srcs.begin(), + renumbered_edgelist_srcs.end(), + sorted_unrenumbered_edgelist_srcs.begin()); + rmm::device_uvector sorted_unrenumbered_edgelist_dsts(renumbered_edgelist_dsts.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumbered_edgelist_dsts.begin(), + renumbered_edgelist_dsts.end(), + sorted_unrenumbered_edgelist_dsts.begin()); + auto sorted_unrenumbered_edgelist_weights = + renumbered_edgelist_weights ? 
std::make_optional>( + (*renumbered_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + if (sorted_unrenumbered_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*renumbered_edgelist_weights).begin(), + (*renumbered_edgelist_weights).end(), + (*sorted_unrenumbered_edgelist_weights).begin()); + } + + if (renumber_map) { + cugraph::unrenumber_int_vertices( + handle, + sorted_unrenumbered_edgelist_srcs.data(), + sorted_unrenumbered_edgelist_srcs.size(), + (*renumber_map).data(), + std::vector{static_cast((*renumber_map).size())}); + cugraph::unrenumber_int_vertices( + handle, + sorted_unrenumbered_edgelist_dsts.data(), + sorted_unrenumbered_edgelist_dsts.size(), + (*renumber_map).data(), + std::vector{static_cast((*renumber_map).size())}); + } + + if (sorted_unrenumbered_edgelist_weights) { + auto sorted_unrenumbered_edge_first = + thrust::make_zip_iterator(sorted_unrenumbered_edgelist_srcs.begin(), + sorted_unrenumbered_edgelist_dsts.begin(), + (*sorted_unrenumbered_edgelist_weights).begin()); + thrust::sort(handle.get_thrust_policy(), + sorted_unrenumbered_edge_first, + sorted_unrenumbered_edge_first + sorted_unrenumbered_edgelist_srcs.size()); + + auto sorted_org_edge_first = thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), + sorted_org_edgelist_dsts.begin(), + (*sorted_org_edgelist_weights).begin()); + return thrust::equal(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + sorted_org_edgelist_srcs.size(), + sorted_unrenumbered_edge_first); + } else { + auto sorted_unrenumbered_edge_first = thrust::make_zip_iterator( + sorted_unrenumbered_edgelist_srcs.begin(), sorted_unrenumbered_edgelist_dsts.begin()); + thrust::sort(handle.get_thrust_policy(), + sorted_unrenumbered_edge_first, + sorted_unrenumbered_edge_first + sorted_unrenumbered_edgelist_srcs.size()); + + auto sorted_org_edge_first = + thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), sorted_org_edgelist_dsts.begin()); + return 
thrust::equal(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + sorted_org_edgelist_srcs.size(), + sorted_unrenumbered_edge_first); + } +} + +template +bool check_renumber_map_invariants( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_hops, + raft::device_span renumber_map, + bool src_is_major) +{ + // Check the invariants in renumber_map + // Say we found the minimum (primary key:hop, secondary key:flag) pairs for every unique vertices, + // where flag is 0 for sources and 1 for destinations. Then, vertices with smaller (hop, flag) + // pairs should be renumbered to smaller numbers than vertices with larger (hop, flag) pairs. + auto org_edgelist_majors = src_is_major ? org_edgelist_srcs : org_edgelist_dsts; + auto org_edgelist_minors = src_is_major ? org_edgelist_dsts : org_edgelist_srcs; + + rmm::device_uvector unique_majors(org_edgelist_majors.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_majors.begin(), + org_edgelist_majors.end(), + unique_majors.begin()); + std::optional> unique_major_hops = + org_edgelist_hops ? 
std::make_optional>( + (*org_edgelist_hops).size(), handle.get_stream()) + : std::nullopt; + if (org_edgelist_hops) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_hops).begin(), + (*org_edgelist_hops).end(), + (*unique_major_hops).begin()); + + auto pair_first = + thrust::make_zip_iterator(unique_majors.begin(), (*unique_major_hops).begin()); + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_majors.size()); + unique_majors.resize( + thrust::distance(unique_majors.begin(), + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + unique_majors.begin(), + unique_majors.end(), + (*unique_major_hops).begin()))), + handle.get_stream()); + (*unique_major_hops).resize(unique_majors.size(), handle.get_stream()); + } else { + thrust::sort(handle.get_thrust_policy(), unique_majors.begin(), unique_majors.end()); + unique_majors.resize( + thrust::distance( + unique_majors.begin(), + thrust::unique(handle.get_thrust_policy(), unique_majors.begin(), unique_majors.end())), + handle.get_stream()); + } + + rmm::device_uvector unique_minors(org_edgelist_minors.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_minors.begin(), + org_edgelist_minors.end(), + unique_minors.begin()); + std::optional> unique_minor_hops = + org_edgelist_hops ? 
std::make_optional>( + (*org_edgelist_hops).size(), handle.get_stream()) + : std::nullopt; + if (org_edgelist_hops) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_hops).begin(), + (*org_edgelist_hops).end(), + (*unique_minor_hops).begin()); + + auto pair_first = + thrust::make_zip_iterator(unique_minors.begin(), (*unique_minor_hops).begin()); + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_minors.size()); + unique_minors.resize( + thrust::distance(unique_minors.begin(), + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + unique_minors.begin(), + unique_minors.end(), + (*unique_minor_hops).begin()))), + handle.get_stream()); + (*unique_minor_hops).resize(unique_minors.size(), handle.get_stream()); + } else { + thrust::sort(handle.get_thrust_policy(), unique_minors.begin(), unique_minors.end()); + unique_minors.resize( + thrust::distance( + unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), unique_minors.begin(), unique_minors.end())), + handle.get_stream()); + } + + rmm::device_uvector sorted_org_vertices(renumber_map.size(), handle.get_stream()); + rmm::device_uvector matching_renumbered_vertices(sorted_org_vertices.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumber_map.begin(), + renumber_map.end(), + sorted_org_vertices.begin()); + thrust::sequence(handle.get_thrust_policy(), + matching_renumbered_vertices.begin(), + matching_renumbered_vertices.end(), + vertex_t{0}); + thrust::sort_by_key(handle.get_thrust_policy(), + sorted_org_vertices.begin(), + sorted_org_vertices.end(), + matching_renumbered_vertices.begin()); + + if (org_edgelist_hops) { + rmm::device_uvector merged_vertices(unique_majors.size() + unique_minors.size(), + handle.get_stream()); + rmm::device_uvector merged_hops(merged_vertices.size(), handle.get_stream()); + rmm::device_uvector merged_flags(merged_vertices.size(), handle.get_stream()); + + auto major_triplet_first = 
thrust::make_zip_iterator(unique_majors.begin(), + (*unique_major_hops).begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_triplet_first = thrust::make_zip_iterator(unique_minors.begin(), + (*unique_minor_hops).begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge(handle.get_thrust_policy(), + major_triplet_first, + major_triplet_first + unique_majors.size(), + minor_triplet_first, + minor_triplet_first + unique_minors.size(), + thrust::make_zip_iterator( + merged_vertices.begin(), merged_hops.begin(), merged_flags.begin())); + merged_vertices.resize( + thrust::distance(merged_vertices.begin(), + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end(), + thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), + handle.get_stream()); + merged_hops.resize(merged_vertices.size(), handle.get_stream()); + merged_flags.resize(merged_vertices.size(), handle.get_stream()); + + auto sort_key_first = thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + merged_vertices.begin()); + + auto num_unique_keys = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(merged_hops.size()), + cugraph::detail::is_first_in_run_t{sort_key_first}); + rmm::device_uvector min_vertices(num_unique_keys, handle.get_stream()); + rmm::device_uvector max_vertices(num_unique_keys, handle.get_stream()); + + auto renumbered_merged_vertex_first = thrust::make_transform_iterator( + merged_vertices.begin(), + [sorted_org_vertices = + raft::device_span(sorted_org_vertices.data(), sorted_org_vertices.size()), + matching_renumbered_vertices = raft::device_span( + matching_renumbered_vertices.data(), + matching_renumbered_vertices.size())] __device__(vertex_t major) { + auto it = thrust::lower_bound( + 
thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major); + return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; + }); + + thrust::reduce_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + renumbered_merged_vertex_first, + thrust::make_discard_iterator(), + min_vertices.begin(), + thrust::equal_to>{}, + thrust::minimum{}); + thrust::reduce_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + renumbered_merged_vertex_first, + thrust::make_discard_iterator(), + max_vertices.begin(), + thrust::equal_to>{}, + thrust::maximum{}); + + auto num_violations = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{1}), + thrust::make_counting_iterator(min_vertices.size()), + [min_vertices = raft::device_span(min_vertices.data(), min_vertices.size()), + max_vertices = raft::device_span(max_vertices.data(), + max_vertices.size())] __device__(size_t i) { + return min_vertices[i] <= max_vertices[i - 1]; + }); + + return (num_violations == 0); + } else { + unique_minors.resize( + thrust::distance( + unique_minors.begin(), + thrust::remove_if(handle.get_thrust_policy(), + unique_minors.begin(), + unique_minors.end(), + [sorted_unique_majors = raft::device_span( + unique_majors.data(), unique_majors.size())] __device__(auto minor) { + return thrust::binary_search(thrust::seq, + sorted_unique_majors.begin(), + sorted_unique_majors.end(), + minor); + })), + handle.get_stream()); + + auto max_major_renumbered_vertex = thrust::transform_reduce( + handle.get_thrust_policy(), + unique_majors.begin(), + unique_majors.end(), + [sorted_org_vertices = + raft::device_span(sorted_org_vertices.data(), sorted_org_vertices.size()), + matching_renumbered_vertices = raft::device_span( + matching_renumbered_vertices.data(), + matching_renumbered_vertices.size())] __device__(vertex_t major) { + auto it = thrust::lower_bound( + 
thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major); + return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; + }, + std::numeric_limits::lowest(), + thrust::maximum{}); + + auto min_minor_renumbered_vertex = thrust::transform_reduce( + handle.get_thrust_policy(), + unique_minors.begin(), + unique_minors.end(), + [sorted_org_vertices = + raft::device_span(sorted_org_vertices.data(), sorted_org_vertices.size()), + matching_renumbered_vertices = raft::device_span( + matching_renumbered_vertices.data(), + matching_renumbered_vertices.size())] __device__(vertex_t minor) { + auto it = thrust::lower_bound( + thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), minor); + return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; + }, + std::numeric_limits::max(), + thrust::minimum{}); + + return (max_major_renumbered_vertex < min_minor_renumbered_vertex); + } +} + +template +class Tests_SamplingPostProcessing + : public ::testing::TestWithParam> { + public: + Tests_SamplingPostProcessing() {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test( + std::tuple const& param) + { + using label_t = int32_t; + using weight_t = float; + using edge_id_t = vertex_t; + using edge_type_t = int32_t; + + bool constexpr store_transposed = false; + bool constexpr renumber = true; + bool constexpr test_weighted = true; + + auto [sampling_post_processing_usecase, input_usecase] = param; + + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + auto [graph, edge_weights, d_renumber_map_labels] = + cugraph::test::construct_graph( + handle, input_usecase, test_weighted, renumber); + + if (cugraph::test::g_perf) { + 
RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto graph_view = graph.view(); + auto edge_weight_view = + edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt; + + raft::random::RngState rng_state(0); + + rmm::device_uvector starting_vertices( + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.num_seeds_per_label, + handle.get_stream()); + cugraph::detail::uniform_random_fill(handle.get_stream(), + starting_vertices.data(), + starting_vertices.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + auto starting_vertex_labels = (sampling_post_processing_usecase.num_labels > 1) + ? std::make_optional>( + starting_vertices.size(), handle.get_stream()) + : std::nullopt; + if (starting_vertex_labels) { + thrust::tabulate( + handle.get_thrust_policy(), + (*starting_vertex_labels).begin(), + (*starting_vertex_labels).end(), + [num_seeds_per_label = sampling_post_processing_usecase.num_seeds_per_label] __device__( + size_t i) { return static_cast(i / num_seeds_per_label); }); + } + + rmm::device_uvector org_edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector org_edgelist_dsts(0, handle.get_stream()); + std::optional> org_edgelist_weights{std::nullopt}; + std::optional> org_edgelist_hops{std::nullopt}; + std::optional> org_labels{std::nullopt}; + std::optional> org_edgelist_label_offsets{std::nullopt}; + std::tie(org_edgelist_srcs, + org_edgelist_dsts, + org_edgelist_weights, + std::ignore, + std::ignore, + org_edgelist_hops, + org_labels, + org_edgelist_label_offsets) = cugraph::uniform_neighbor_sample( + handle, + graph_view, + edge_weight_view, + std::nullopt, + std::nullopt, + raft::device_span(starting_vertices.data(), starting_vertices.size()), + starting_vertex_labels ? 
std::make_optional>( + (*starting_vertex_labels).data(), (*starting_vertex_labels).size()) + : std::nullopt, + std::nullopt, + raft::host_span(sampling_post_processing_usecase.fanouts.data(), + sampling_post_processing_usecase.fanouts.size()), + rng_state, + sampling_post_processing_usecase.fanouts.size() > 1, + sampling_post_processing_usecase.sample_with_replacement, + (!sampling_post_processing_usecase.compress_per_hop && + (sampling_post_processing_usecase.fanouts.size() > 1)) + ? cugraph::prior_sources_behavior_t::EXCLUDE + : cugraph::prior_sources_behavior_t::DEFAULT, + false); + + if (!sampling_post_processing_usecase.src_is_major) { + std::swap(org_edgelist_srcs, org_edgelist_dsts); + } + + starting_vertices.resize(0, handle.get_stream()); + starting_vertices.shrink_to_fit(handle.get_stream()); + starting_vertex_labels = std::nullopt; + + { + rmm::device_uvector renumbered_and_sorted_edgelist_srcs(org_edgelist_srcs.size(), + handle.get_stream()); + rmm::device_uvector renumbered_and_sorted_edgelist_dsts(org_edgelist_dsts.size(), + handle.get_stream()); + auto renumbered_and_sorted_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + std::optional> renumbered_and_sorted_edgelist_edge_ids{ + std::nullopt}; + std::optional> renumbered_and_sorted_edgelist_edge_types{ + std::nullopt}; + auto renumbered_and_sorted_edgelist_hops = + org_edgelist_hops + ? 
std::make_optional(std::make_tuple( + rmm::device_uvector((*org_edgelist_hops).size(), handle.get_stream()), + sampling_post_processing_usecase.fanouts.size())) + : std::nullopt; + + raft::copy(renumbered_and_sorted_edgelist_srcs.data(), + org_edgelist_srcs.data(), + org_edgelist_srcs.size(), + handle.get_stream()); + raft::copy(renumbered_and_sorted_edgelist_dsts.data(), + org_edgelist_dsts.data(), + org_edgelist_dsts.size(), + handle.get_stream()); + if (renumbered_and_sorted_edgelist_weights) { + raft::copy((*renumbered_and_sorted_edgelist_weights).data(), + (*org_edgelist_weights).data(), + (*org_edgelist_weights).size(), + handle.get_stream()); + } + if (renumbered_and_sorted_edgelist_hops) { + raft::copy(std::get<0>(*renumbered_and_sorted_edgelist_hops).data(), + (*org_edgelist_hops).data(), + (*org_edgelist_hops).size(), + handle.get_stream()); + } + + std::optional> renumbered_and_sorted_edgelist_label_hop_offsets{ + std::nullopt}; + rmm::device_uvector renumbered_and_sorted_renumber_map(0, handle.get_stream()); + std::optional> renumbered_and_sorted_renumber_map_label_offsets{ + std::nullopt}; + + { + size_t free_size{}; + size_t total_size{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + std::cout << "free_size=" << free_size / (1024.0 * 1024.0 * 1024.0) + << "GB total_size=" << total_size / (1024.0 * 1024.0 * 1024.0) << "GB." 
+ << std::endl; + } + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Renumber and sort sampled edgelist"); + } + + std::tie(renumbered_and_sorted_edgelist_srcs, + renumbered_and_sorted_edgelist_dsts, + renumbered_and_sorted_edgelist_weights, + renumbered_and_sorted_edgelist_edge_ids, + renumbered_and_sorted_edgelist_edge_types, + renumbered_and_sorted_edgelist_label_hop_offsets, + renumbered_and_sorted_renumber_map, + renumbered_and_sorted_renumber_map_label_offsets) = + cugraph::renumber_and_sort_sampled_edgelist( + handle, + std::move(renumbered_and_sorted_edgelist_srcs), + std::move(renumbered_and_sorted_edgelist_dsts), + std::move(renumbered_and_sorted_edgelist_weights), + std::move(renumbered_and_sorted_edgelist_edge_ids), + std::move(renumbered_and_sorted_edgelist_edge_types), + std::move(renumbered_and_sorted_edgelist_hops), + org_edgelist_label_offsets + ? std::make_optional(std::make_tuple( + raft::device_span((*org_edgelist_label_offsets).data(), + (*org_edgelist_label_offsets).size()), + sampling_post_processing_usecase.num_labels)) + : std::nullopt, + sampling_post_processing_usecase.src_is_major); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (sampling_post_processing_usecase.check_correctness) { + if (renumbered_and_sorted_edgelist_label_hop_offsets) { + ASSERT_TRUE((*renumbered_and_sorted_edgelist_label_hop_offsets).size() == + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size() + + 1) + << "Renumbered and sorted edge list (label,hop) offset array size should coincide with " + "the number of labels * the number of hops + 1."; + + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + (*renumbered_and_sorted_edgelist_label_hop_offsets).begin(), + 
(*renumbered_and_sorted_edgelist_label_hop_offsets).end())) + << "Renumbered and sorted edge list (label,hop) offset array values should be " + "non-decreasing."; + } + + if (renumbered_and_sorted_renumber_map_label_offsets) { + ASSERT_TRUE((*renumbered_and_sorted_renumber_map_label_offsets).size() == + sampling_post_processing_usecase.num_labels + 1) + << "Renumbered and sorted offset (label, hop) offset array size should coincide with " + "the number of labels + 1."; + + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + (*renumbered_and_sorted_renumber_map_label_offsets).begin(), + (*renumbered_and_sorted_renumber_map_label_offsets).end())) + << "Renumbered and sorted renumber map label offset array values should be " + "non-decreasing."; + + ASSERT_TRUE( + (*renumbered_and_sorted_renumber_map_label_offsets).back_element(handle.get_stream()) == + renumbered_and_sorted_renumber_map.size()) + << "Renumbered and sorted renumber map label offset array's last value should coincide " + "with the renumber map size."; + } + + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { + size_t edgelist_start_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t edgelist_end_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i + 1, handle.get_stream()) + : org_edgelist_srcs.size(); + if (edgelist_start_offset == edgelist_end_offset) continue; + + auto this_label_org_edgelist_srcs = + raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_dsts = + raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_hops = + org_edgelist_hops ? 
std::make_optional>( + (*org_edgelist_hops).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + auto this_label_org_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + auto this_label_output_edgelist_srcs = raft::device_span( + renumbered_and_sorted_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_output_edgelist_dsts = raft::device_span( + renumbered_and_sorted_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_output_edgelist_weights = + renumbered_and_sorted_edgelist_weights + ? std::make_optional>( + (*renumbered_and_sorted_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + size_t renumber_map_start_offset = + renumbered_and_sorted_renumber_map_label_offsets + ? (*renumbered_and_sorted_renumber_map_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t renumber_map_end_offset = renumbered_and_sorted_renumber_map_label_offsets + ? (*renumbered_and_sorted_renumber_map_label_offsets) + .element(i + 1, handle.get_stream()) + : renumbered_and_sorted_renumber_map.size(); + auto this_label_output_renumber_map = raft::device_span( + renumbered_and_sorted_renumber_map.data() + renumber_map_start_offset, + renumber_map_end_offset - renumber_map_start_offset); + + // check whether the edges are properly sorted + + auto this_label_output_edgelist_majors = sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_srcs + : this_label_output_edgelist_dsts; + auto this_label_output_edgelist_minors = sampling_post_processing_usecase.src_is_major + ? 
this_label_output_edgelist_dsts + : this_label_output_edgelist_srcs; + + if (this_label_org_edgelist_hops) { + auto num_hops = sampling_post_processing_usecase.fanouts.size(); + auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), + this_label_output_edgelist_minors.begin()); + for (size_t j = 0; j < num_hops; ++j) { + auto hop_start_offset = (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i * num_hops + j, handle.get_stream()) - + (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i * num_hops, handle.get_stream()); + auto hop_end_offset = (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i * num_hops + j + 1, handle.get_stream()) - + (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i * num_hops, handle.get_stream()); + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + edge_first + hop_start_offset, + edge_first + hop_end_offset)) + << "Renumbered and sorted output edges are not properly sorted."; + } + } else { + auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), + this_label_output_edgelist_minors.begin()); + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + edge_first, + edge_first + this_label_output_edgelist_majors.size())) + << "Renumbered and sorted output edges are not properly sorted."; + } + + // check whether renumbering recovers the original edge list + + ASSERT_TRUE(compare_edgelist(handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_weights, + this_label_output_edgelist_srcs, + this_label_output_edgelist_dsts, + this_label_output_edgelist_weights, + std::make_optional(this_label_output_renumber_map))) + << "Unrenumbering the renumbered and sorted edge list does not recover the original " + "edgelist."; + + // Check the invariants in renumber_map + + ASSERT_TRUE(check_renumber_map_invariants(handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + 
this_label_org_edgelist_hops, + this_label_output_renumber_map, + sampling_post_processing_usecase.src_is_major)) + << "Renumbered and sorted output renumber map violates invariants."; + } + } + } + + { + rmm::device_uvector renumbered_and_compressed_edgelist_srcs( + org_edgelist_srcs.size(), handle.get_stream()); + rmm::device_uvector renumbered_and_compressed_edgelist_dsts( + org_edgelist_dsts.size(), handle.get_stream()); + auto renumbered_and_compressed_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + std::optional> renumbered_and_compressed_edgelist_edge_ids{ + std::nullopt}; + std::optional> renumbered_and_compressed_edgelist_edge_types{ + std::nullopt}; + auto renumbered_and_compressed_edgelist_hops = + org_edgelist_hops + ? std::make_optional(std::make_tuple( + rmm::device_uvector((*org_edgelist_hops).size(), handle.get_stream()), + sampling_post_processing_usecase.fanouts.size())) + : std::nullopt; + + raft::copy(renumbered_and_compressed_edgelist_srcs.data(), + org_edgelist_srcs.data(), + org_edgelist_srcs.size(), + handle.get_stream()); + raft::copy(renumbered_and_compressed_edgelist_dsts.data(), + org_edgelist_dsts.data(), + org_edgelist_dsts.size(), + handle.get_stream()); + if (renumbered_and_compressed_edgelist_weights) { + raft::copy((*renumbered_and_compressed_edgelist_weights).data(), + (*org_edgelist_weights).data(), + (*org_edgelist_weights).size(), + handle.get_stream()); + } + if (renumbered_and_compressed_edgelist_hops) { + raft::copy(std::get<0>(*renumbered_and_compressed_edgelist_hops).data(), + (*org_edgelist_hops).data(), + (*org_edgelist_hops).size(), + handle.get_stream()); + } + + std::optional> renumbered_and_compressed_nzd_vertices{ + std::nullopt}; + rmm::device_uvector renumbered_and_compressed_offsets(0, handle.get_stream()); + rmm::device_uvector renumbered_and_compressed_edgelist_minors(0, + handle.get_stream()); + std::optional> 
renumbered_and_compressed_offset_label_hop_offsets{ + std::nullopt}; + rmm::device_uvector renumbered_and_compressed_renumber_map(0, handle.get_stream()); + std::optional> + renumbered_and_compressed_renumber_map_label_offsets{std::nullopt}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Renumber and compressed sampled edgelist"); + } + + std::tie(renumbered_and_compressed_nzd_vertices, + renumbered_and_compressed_offsets, + renumbered_and_compressed_edgelist_minors, + renumbered_and_compressed_edgelist_weights, + renumbered_and_compressed_edgelist_edge_ids, + renumbered_and_compressed_edgelist_edge_types, + renumbered_and_compressed_offset_label_hop_offsets, + renumbered_and_compressed_renumber_map, + renumbered_and_compressed_renumber_map_label_offsets) = + cugraph::renumber_and_compress_sampled_edgelist( + handle, + std::move(renumbered_and_compressed_edgelist_srcs), + std::move(renumbered_and_compressed_edgelist_dsts), + std::move(renumbered_and_compressed_edgelist_weights), + std::move(renumbered_and_compressed_edgelist_edge_ids), + std::move(renumbered_and_compressed_edgelist_edge_types), + std::move(renumbered_and_compressed_edgelist_hops), + org_edgelist_label_offsets + ? 
std::make_optional(std::make_tuple( + raft::device_span((*org_edgelist_label_offsets).data(), + (*org_edgelist_label_offsets).size()), + sampling_post_processing_usecase.num_labels)) + : std::nullopt, + sampling_post_processing_usecase.src_is_major, + sampling_post_processing_usecase.compress_per_hop, + sampling_post_processing_usecase.doubly_compress); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (sampling_post_processing_usecase.check_correctness) { + if (renumbered_and_compressed_nzd_vertices) { + ASSERT_TRUE(renumbered_and_compressed_offsets.size() == + (*renumbered_and_compressed_nzd_vertices).size() + 1) + << "Renumbered and compressed offset array size should coincide with the number of " + "non-zero-degree vertices + 1."; + } + + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + renumbered_and_compressed_offsets.begin(), + renumbered_and_compressed_offsets.end())) + << "Renumbered and compressed offset array values should be non-decreasing."; + + ASSERT_TRUE(renumbered_and_compressed_offsets.back_element(handle.get_stream()) == + renumbered_and_compressed_edgelist_minors.size()) + << "Renumbered and compressed offset array's last value should coincide with the number " + "of " + "edges."; + + if (renumbered_and_compressed_offset_label_hop_offsets) { + ASSERT_TRUE((*renumbered_and_compressed_offset_label_hop_offsets).size() == + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size() + + 1) + << "Renumbered and compressed offset (label,hop) offset array size should coincide " + "with " + "the number of labels * the number of hops + 1."; + + ASSERT_TRUE( + thrust::is_sorted(handle.get_thrust_policy(), + (*renumbered_and_compressed_offset_label_hop_offsets).begin(), + (*renumbered_and_compressed_offset_label_hop_offsets).end())) + << "Renumbered and compressed offset 
(label,hop) offset array values should be " + "non-decreasing."; + + ASSERT_TRUE((*renumbered_and_compressed_offset_label_hop_offsets) + .back_element(handle.get_stream()) == + renumbered_and_compressed_offsets.size() - 1) + << "Renumbered and compressed offset (label,hop) offset array's last value should " + "coincide with the offset array size - 1."; + } + + if (renumbered_and_compressed_renumber_map_label_offsets) { + ASSERT_TRUE((*renumbered_and_compressed_renumber_map_label_offsets).size() == + sampling_post_processing_usecase.num_labels + 1) + << "Renumbered and compressed offset (label, hop) offset array size should coincide " + "with " + "the number of labels + 1."; + + ASSERT_TRUE( + thrust::is_sorted(handle.get_thrust_policy(), + (*renumbered_and_compressed_renumber_map_label_offsets).begin(), + (*renumbered_and_compressed_renumber_map_label_offsets).end())) + << "Renumbered and compressed renumber map label offset array values should be " + "non-decreasing."; + + ASSERT_TRUE((*renumbered_and_compressed_renumber_map_label_offsets) + .back_element(handle.get_stream()) == + renumbered_and_compressed_renumber_map.size()) + << "Renumbered and compressed renumber map label offset array's last value should " + "coincide with the renumber map size."; + } + + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { + size_t edgelist_start_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t edgelist_end_offset = + org_edgelist_label_offsets + ? 
(*org_edgelist_label_offsets).element(i + 1, handle.get_stream()) + : org_edgelist_srcs.size(); + if (edgelist_start_offset == edgelist_end_offset) continue; + + auto this_label_org_edgelist_srcs = + raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_dsts = + raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_hops = + org_edgelist_hops ? std::make_optional>( + (*org_edgelist_hops).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + auto this_label_org_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + rmm::device_uvector this_label_output_edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector this_label_output_edgelist_dsts(0, handle.get_stream()); + auto this_label_output_edgelist_weights = + renumbered_and_compressed_edgelist_weights + ? std::make_optional>(0, handle.get_stream()) + : std::nullopt; + this_label_output_edgelist_srcs.reserve(edgelist_end_offset - edgelist_start_offset, + handle.get_stream()); + this_label_output_edgelist_dsts.reserve(edgelist_end_offset - edgelist_start_offset, + handle.get_stream()); + if (this_label_output_edgelist_weights) { + (*this_label_output_edgelist_weights) + .reserve(edgelist_end_offset - edgelist_start_offset, handle.get_stream()); + } + + // decompress + + auto num_hops = sampling_post_processing_usecase.fanouts.size(); + for (size_t j = 0; j < num_hops; ++j) { + auto offset_start_offset = renumbered_and_compressed_offset_label_hop_offsets + ? (*renumbered_and_compressed_offset_label_hop_offsets) + .element(i * num_hops + j, handle.get_stream()) + : size_t{0}; + auto offset_end_offset = renumbered_and_compressed_offset_label_hop_offsets + ? 
((*renumbered_and_compressed_offset_label_hop_offsets) + .element(i * num_hops + j + 1, handle.get_stream()) + + 1) + : renumbered_and_compressed_offsets.size(); + + auto base_v = + (!sampling_post_processing_usecase.doubly_compress && + !sampling_post_processing_usecase.compress_per_hop && (j > 0)) + ? static_cast(offset_start_offset - + (*renumbered_and_compressed_offset_label_hop_offsets) + .element(i * num_hops, handle.get_stream())) + : vertex_t{0}; + + raft::device_span d_offsets( + renumbered_and_compressed_offsets.data() + offset_start_offset, + offset_end_offset - offset_start_offset); + std::vector h_offsets(d_offsets.size()); + raft::update_host( + h_offsets.data(), d_offsets.data(), h_offsets.size(), handle.get_stream()); + handle.sync_stream(); + + auto old_size = this_label_output_edgelist_srcs.size(); + this_label_output_edgelist_srcs.resize(old_size + (h_offsets.back() - h_offsets[0]), + handle.get_stream()); + this_label_output_edgelist_dsts.resize(this_label_output_edgelist_srcs.size(), + handle.get_stream()); + if (this_label_output_edgelist_weights) { + (*this_label_output_edgelist_weights) + .resize(this_label_output_edgelist_srcs.size(), handle.get_stream()); + } + thrust::transform( + handle.get_thrust_policy(), + thrust::make_counting_iterator(h_offsets[0]), + thrust::make_counting_iterator(h_offsets.back()), + (sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_srcs.begin() + : this_label_output_edgelist_dsts.begin()) + + old_size, + [offsets = raft::device_span(d_offsets.data(), d_offsets.size()), + nzd_vertices = + renumbered_and_compressed_nzd_vertices + ? 
thrust::make_optional>( + (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset, + (offset_end_offset - offset_start_offset) - 1) + : thrust::nullopt, + base_v] __device__(size_t i) { + auto idx = static_cast(thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); + if (nzd_vertices) { + return (*nzd_vertices)[idx]; + } else { + return base_v + static_cast(idx); + } + }); + thrust::copy(handle.get_thrust_policy(), + renumbered_and_compressed_edgelist_minors.begin() + h_offsets[0], + renumbered_and_compressed_edgelist_minors.begin() + h_offsets.back(), + (sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_dsts.begin() + : this_label_output_edgelist_srcs.begin()) + + old_size); + if (this_label_output_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*renumbered_and_compressed_edgelist_weights).begin() + h_offsets[0], + (*renumbered_and_compressed_edgelist_weights).begin() + h_offsets.back(), + (*this_label_output_edgelist_weights).begin() + old_size); + } + } + + size_t renumber_map_start_offset = + renumbered_and_compressed_renumber_map_label_offsets + ? (*renumbered_and_compressed_renumber_map_label_offsets) + .element(i, handle.get_stream()) + : size_t{0}; + size_t renumber_map_end_offset = + renumbered_and_compressed_renumber_map_label_offsets + ? 
(*renumbered_and_compressed_renumber_map_label_offsets) + .element(i + 1, handle.get_stream()) + : renumbered_and_compressed_renumber_map.size(); + auto this_label_output_renumber_map = raft::device_span( + renumbered_and_compressed_renumber_map.data() + renumber_map_start_offset, + renumber_map_end_offset - renumber_map_start_offset); + + // check whether renumbering recovers the original edge list + + ASSERT_TRUE(compare_edgelist( + handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_weights, + raft::device_span(this_label_output_edgelist_srcs.data(), + this_label_output_edgelist_srcs.size()), + raft::device_span(this_label_output_edgelist_dsts.data(), + this_label_output_edgelist_dsts.size()), + this_label_output_edgelist_weights + ? std::make_optional>( + (*this_label_output_edgelist_weights).data(), + (*this_label_output_edgelist_weights).size()) + : std::nullopt, + std::make_optional(this_label_output_renumber_map))) + << "Unrenumbering the renumbered and sorted edge list does not recover the original " + "edgelist."; + + // Check the invariants in renumber_map + + ASSERT_TRUE(check_renumber_map_invariants(handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_hops, + this_label_output_renumber_map, + sampling_post_processing_usecase.src_is_major)) + << "Renumbered and sorted output renumber map violates invariants."; + } + } + } + + { + rmm::device_uvector sorted_edgelist_srcs(org_edgelist_srcs.size(), + handle.get_stream()); + rmm::device_uvector sorted_edgelist_dsts(org_edgelist_dsts.size(), + handle.get_stream()); + auto sorted_edgelist_weights = org_edgelist_weights + ? std::make_optional>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + std::optional> sorted_edgelist_edge_ids{std::nullopt}; + std::optional> sorted_edgelist_edge_types{std::nullopt}; + auto sorted_edgelist_hops = + org_edgelist_hops + ? 
std::make_optional(std::make_tuple( + rmm::device_uvector((*org_edgelist_hops).size(), handle.get_stream()), + sampling_post_processing_usecase.fanouts.size())) + : std::nullopt; + + raft::copy(sorted_edgelist_srcs.data(), + org_edgelist_srcs.data(), + org_edgelist_srcs.size(), + handle.get_stream()); + raft::copy(sorted_edgelist_dsts.data(), + org_edgelist_dsts.data(), + org_edgelist_dsts.size(), + handle.get_stream()); + if (sorted_edgelist_weights) { + raft::copy((*sorted_edgelist_weights).data(), + (*org_edgelist_weights).data(), + (*org_edgelist_weights).size(), + handle.get_stream()); + } + if (sorted_edgelist_hops) { + raft::copy(std::get<0>(*sorted_edgelist_hops).data(), + (*org_edgelist_hops).data(), + (*org_edgelist_hops).size(), + handle.get_stream()); + } + + std::optional> sorted_edgelist_label_hop_offsets{std::nullopt}; + + { + size_t free_size{}; + size_t total_size{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + std::cout << "free_size=" << free_size / (1024.0 * 1024.0 * 1024.0) + << "GB total_size=" << total_size / (1024.0 * 1024.0 * 1024.0) << "GB." + << std::endl; + } + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Sort sampled edgelist"); + } + + std::tie(sorted_edgelist_srcs, + sorted_edgelist_dsts, + sorted_edgelist_weights, + sorted_edgelist_edge_ids, + sorted_edgelist_edge_types, + sorted_edgelist_label_hop_offsets) = + cugraph::sort_sampled_edgelist( + handle, + std::move(sorted_edgelist_srcs), + std::move(sorted_edgelist_dsts), + std::move(sorted_edgelist_weights), + std::move(sorted_edgelist_edge_ids), + std::move(sorted_edgelist_edge_types), + std::move(sorted_edgelist_hops), + org_edgelist_label_offsets + ? 
std::make_optional(std::make_tuple( + raft::device_span((*org_edgelist_label_offsets).data(), + (*org_edgelist_label_offsets).size()), + sampling_post_processing_usecase.num_labels)) + : std::nullopt, + sampling_post_processing_usecase.src_is_major); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (sampling_post_processing_usecase.check_correctness) { + if (sorted_edgelist_label_hop_offsets) { + ASSERT_TRUE((*sorted_edgelist_label_hop_offsets).size() == + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size() + + 1) + << "Sorted edge list (label,hop) offset array size should coincide with " + "the number of labels * the number of hops + 1."; + + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + (*sorted_edgelist_label_hop_offsets).begin(), + (*sorted_edgelist_label_hop_offsets).end())) + << "Sorted edge list (label,hop) offset array values should be " + "non-decreasing."; + } + + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { + size_t edgelist_start_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t edgelist_end_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i + 1, handle.get_stream()) + : org_edgelist_srcs.size(); + if (edgelist_start_offset == edgelist_end_offset) continue; + + auto this_label_org_edgelist_srcs = + raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_dsts = + raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_hops = + org_edgelist_hops ? 
std::make_optional>( + (*org_edgelist_hops).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + auto this_label_org_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + auto this_label_output_edgelist_srcs = + raft::device_span(sorted_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_output_edgelist_dsts = + raft::device_span(sorted_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_output_edgelist_weights = + sorted_edgelist_weights ? std::make_optional>( + (*sorted_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + // check whether the edges are properly sorted + + auto this_label_output_edgelist_majors = sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_srcs + : this_label_output_edgelist_dsts; + auto this_label_output_edgelist_minors = sampling_post_processing_usecase.src_is_major + ? 
this_label_output_edgelist_dsts + : this_label_output_edgelist_srcs; + + if (this_label_org_edgelist_hops) { + auto num_hops = sampling_post_processing_usecase.fanouts.size(); + auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), + this_label_output_edgelist_minors.begin()); + for (size_t j = 0; j < num_hops; ++j) { + auto hop_start_offset = + (*sorted_edgelist_label_hop_offsets) + .element(i * num_hops + j, handle.get_stream()) - + (*sorted_edgelist_label_hop_offsets).element(i * num_hops, handle.get_stream()); + auto hop_end_offset = + (*sorted_edgelist_label_hop_offsets) + .element(i * num_hops + j + 1, handle.get_stream()) - + (*sorted_edgelist_label_hop_offsets).element(i * num_hops, handle.get_stream()); + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + edge_first + hop_start_offset, + edge_first + hop_end_offset)) + << "Renumbered and sorted output edges are not properly sorted."; + } + } else { + auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), + this_label_output_edgelist_minors.begin()); + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + edge_first, + edge_first + this_label_output_edgelist_majors.size())) + << "Renumbered and sorted output edges are not properly sorted."; + } + + // check whether renumbering recovers the original edge list + + ASSERT_TRUE( + compare_edgelist(handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_weights, + this_label_output_edgelist_srcs, + this_label_output_edgelist_dsts, + this_label_output_edgelist_weights, + std::optional>{std::nullopt})) + << "Sorted edge list does not coincide with the original edgelist."; + } + } + } + } +}; + +using Tests_SamplingPostProcessing_File = Tests_SamplingPostProcessing; +using Tests_SamplingPostProcessing_Rmat = Tests_SamplingPostProcessing; + +TEST_P(Tests_SamplingPostProcessing_File, CheckInt32Int32) +{ + 
run_current_test(override_File_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SamplingPostProcessing_Rmat, CheckInt32Int32) +{ + run_current_test(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SamplingPostProcessing_Rmat, CheckInt32Int64) +{ + run_current_test(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SamplingPostProcessing_Rmat, CheckInt64Int64) +{ + run_current_test(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_SamplingPostProcessing_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values( + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 4, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, false, 
true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 
10, 25}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, true, false, true}), + ::testing::Values(cugraph::test::File_Usecase("karate.mtx"), + cugraph::test::File_Usecase("dolphins.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_SamplingPostProcessing_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values( + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, true, false, true, true}, + 
SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, true, true, false, true}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, + Tests_SamplingPostProcessing_Rmat, + ::testing::Combine( + // enable 
correctness checks + ::testing::Values( + SamplingPostProcessing_Usecase{1, 64, {10}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, true, false, false, false}, + 
SamplingPostProcessing_Usecase{256, 64, {10}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, true, false, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() From b2e85bff39a411d02f5a167f7bfab376ae9ccb67 Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Tue, 19 Sep 2023 13:33:01 -0400 Subject: [PATCH 06/22] Update `cugraph-dgl` conv layers to use improved graph class (#3849) This PR: - Removes the usage of the deprecated `StaticCSC` and `SampledCSC` - Support creating CSR and storing edge information in SparseGraph - clean up unit tests - Adds 
GATv2Conv layer - Adds `pylibcugraphops` as a dependency of `cugraph-dgl` conda package Authors: - Tingyu Wang (https://github.com/tingyu66) Approvers: - Jake Awe (https://github.com/AyodeAwe) - Vibhu Jawa (https://github.com/VibhuJawa) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3849 --- conda/recipes/cugraph-dgl/meta.yaml | 1 + .../cugraph_dgl/nn/conv/__init__.py | 2 + .../cugraph-dgl/cugraph_dgl/nn/conv/base.py | 262 +++++++++++++----- .../cugraph_dgl/nn/conv/gatconv.py | 140 +++++++--- .../cugraph_dgl/nn/conv/gatv2conv.py | 249 +++++++++++++++++ .../cugraph_dgl/nn/conv/relgraphconv.py | 70 ++--- .../cugraph_dgl/nn/conv/sageconv.py | 122 ++++---- .../cugraph_dgl/nn/conv/transformerconv.py | 20 +- python/cugraph-dgl/tests/conftest.py | 3 + python/cugraph-dgl/tests/nn/test_gatconv.py | 100 ++++--- python/cugraph-dgl/tests/nn/test_gatv2conv.py | 147 ++++++++++ .../cugraph-dgl/tests/nn/test_relgraphconv.py | 71 +++-- python/cugraph-dgl/tests/nn/test_sageconv.py | 65 +++-- .../cugraph-dgl/tests/nn/test_sparsegraph.py | 28 +- .../tests/nn/test_transformerconv.py | 41 ++- python/cugraph-dgl/tests/test_dataset.py | 2 +- ...ograph.py => test_from_dgl_heterograph.py} | 0 17 files changed, 978 insertions(+), 345 deletions(-) create mode 100644 python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py create mode 100644 python/cugraph-dgl/tests/nn/test_gatv2conv.py rename python/cugraph-dgl/tests/{test_from_dgl_hetrograph.py => test_from_dgl_heterograph.py} (100%) diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml index 2fbc6360c04..9e9fcd2faf1 100644 --- a/conda/recipes/cugraph-dgl/meta.yaml +++ b/conda/recipes/cugraph-dgl/meta.yaml @@ -26,6 +26,7 @@ requirements: - dgl >=1.1.0.cu* - numba >=0.57 - numpy >=1.21 + - pylibcugraphops ={{ version }} - python - pytorch diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py index 
e5acbf34478..3e7f2f076f0 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py @@ -13,6 +13,7 @@ from .base import SparseGraph from .gatconv import GATConv +from .gatv2conv import GATv2Conv from .relgraphconv import RelGraphConv from .sageconv import SAGEConv from .transformerconv import TransformerConv @@ -20,6 +21,7 @@ __all__ = [ "SparseGraph", "GATConv", + "GATv2Conv", "RelGraphConv", "SAGEConv", "TransformerConv", diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py index 0eeaed29d86..307eb33078e 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py @@ -17,38 +17,7 @@ torch = import_optional("torch") ops_torch = import_optional("pylibcugraphops.pytorch") - - -class BaseConv(torch.nn.Module): - r"""An abstract base class for cugraph-ops nn module.""" - - def __init__(self): - super().__init__() - self._cached_offsets_fg = None - - def reset_parameters(self): - r"""Resets all learnable parameters of the module.""" - raise NotImplementedError - - def forward(self, *args): - r"""Runs the forward pass of the module.""" - raise NotImplementedError - - def pad_offsets(self, offsets: torch.Tensor, size: int) -> torch.Tensor: - r"""Pad zero-in-degree nodes to the end of offsets to reach size. 
This - is used to augment offset tensors from DGL blocks (MFGs) to be - compatible with cugraph-ops full-graph primitives.""" - if self._cached_offsets_fg is None: - self._cached_offsets_fg = torch.empty( - size, dtype=offsets.dtype, device=offsets.device - ) - elif self._cached_offsets_fg.numel() < size: - self._cached_offsets_fg.resize_(size) - - self._cached_offsets_fg[: offsets.numel()] = offsets - self._cached_offsets_fg[offsets.numel() : size] = offsets[-1] - - return self._cached_offsets_fg[:size] +dgl = import_optional("dgl") def compress_ids(ids: torch.Tensor, size: int) -> torch.Tensor: @@ -63,8 +32,9 @@ def decompress_ids(c_ids: torch.Tensor) -> torch.Tensor: class SparseGraph(object): - r"""A god-class to store different sparse formats needed by cugraph-ops - and facilitate sparse format conversions. + r"""A class to create and store different sparse formats needed by + cugraph-ops. It always creates a CSC representation and can provide COO- or + CSR-format if needed. Parameters ---------- @@ -89,25 +59,43 @@ class SparseGraph(object): consists of the sources between `src_indices[cdst_indices[k]]` and `src_indices[cdst_indices[k+1]]`. - dst_ids_is_sorted: bool - Whether `dst_ids` has been sorted in an ascending order. When sorted, - creating CSC layout is much faster. + values: torch.Tensor, optional + Values on the edges. + + is_sorted: bool + Whether the COO inputs (src_ids, dst_ids, values) have been sorted by + `dst_ids` in an ascending order. CSC layout creation is much faster + when sorted. formats: str or tuple of str, optional - The desired sparse formats to create for the graph. + The desired sparse formats to create for the graph. The formats tuple + must include "csc". Default: "csc". reduce_memory: bool, optional When set, the tensors are not required by the desired formats will be - set to `None`. + set to `None`. Default: True. Notes ----- For MFGs (sampled graphs), the node ids must have been renumbered. 
""" - supported_formats = {"coo": ("src_ids", "dst_ids"), "csc": ("cdst_ids", "src_ids")} - - all_tensors = set(["src_ids", "dst_ids", "csrc_ids", "cdst_ids"]) + supported_formats = { + "coo": ("_src_ids", "_dst_ids"), + "csc": ("_cdst_ids", "_src_ids"), + "csr": ("_csrc_ids", "_dst_ids", "_perm_csc2csr"), + } + + all_tensors = set( + [ + "_src_ids", + "_dst_ids", + "_csrc_ids", + "_cdst_ids", + "_perm_coo2csc", + "_perm_csc2csr", + ] + ) def __init__( self, @@ -116,15 +104,19 @@ def __init__( dst_ids: Optional[torch.Tensor] = None, csrc_ids: Optional[torch.Tensor] = None, cdst_ids: Optional[torch.Tensor] = None, - dst_ids_is_sorted: bool = False, - formats: Optional[Union[str, Tuple[str]]] = None, + values: Optional[torch.Tensor] = None, + is_sorted: bool = False, + formats: Union[str, Tuple[str]] = "csc", reduce_memory: bool = True, ): self._num_src_nodes, self._num_dst_nodes = size - self._dst_ids_is_sorted = dst_ids_is_sorted + self._is_sorted = is_sorted if dst_ids is None and cdst_ids is None: - raise ValueError("One of 'dst_ids' and 'cdst_ids' must be given.") + raise ValueError( + "One of 'dst_ids' and 'cdst_ids' must be given " + "to create a SparseGraph." + ) if src_ids is not None: src_ids = src_ids.contiguous() @@ -148,21 +140,40 @@ def __init__( ) cdst_ids = cdst_ids.contiguous() + if values is not None: + values = values.contiguous() + self._src_ids = src_ids self._dst_ids = dst_ids self._csrc_ids = csrc_ids self._cdst_ids = cdst_ids - self._perm = None + self._values = values + self._perm_coo2csc = None + self._perm_csc2csr = None if isinstance(formats, str): formats = (formats,) - - if formats is not None: - for format_ in formats: - assert format_ in SparseGraph.supported_formats - self.__getattribute__(f"_create_{format_}")() self._formats = formats + if "csc" not in formats: + raise ValueError( + f"{self.__class__.__name__}.formats must contain " + f"'csc', but got {formats}." 
+ ) + + # always create csc first + if self._cdst_ids is None: + if not self._is_sorted: + self._dst_ids, self._perm_coo2csc = torch.sort(self._dst_ids) + self._src_ids = self._src_ids[self._perm_coo2csc] + if self._values is not None: + self._values = self._values[self._perm_coo2csc] + self._cdst_ids = compress_ids(self._dst_ids, self._num_dst_nodes) + + for format_ in formats: + assert format_ in SparseGraph.supported_formats + self.__getattribute__(f"{format_}")() + self._reduce_memory = reduce_memory if reduce_memory: self.reduce_memory() @@ -170,8 +181,6 @@ def __init__( def reduce_memory(self): """Remove the tensors that are not necessary to create the desired sparse formats to reduce memory footprint.""" - - self._perm = None if self._formats is None: return @@ -181,16 +190,22 @@ def reduce_memory(self): for t in SparseGraph.all_tensors.difference(set(tensors_needed)): self.__dict__[t] = None - def _create_coo(self): + def src_ids(self) -> torch.Tensor: + return self._src_ids + + def cdst_ids(self) -> torch.Tensor: + return self._cdst_ids + + def dst_ids(self) -> torch.Tensor: if self._dst_ids is None: self._dst_ids = decompress_ids(self._cdst_ids) + return self._dst_ids - def _create_csc(self): - if self._cdst_ids is None: - if not self._dst_ids_is_sorted: - self._dst_ids, self._perm = torch.sort(self._dst_ids) - self._src_ids = self._src_ids[self._perm] - self._cdst_ids = compress_ids(self._dst_ids, self._num_dst_nodes) + def csrc_ids(self) -> torch.Tensor: + if self._csrc_ids is None: + src_ids, self._perm_csc2csr = torch.sort(self._src_ids) + self._csrc_ids = compress_ids(src_ids, self._num_src_nodes) + return self._csrc_ids def num_src_nodes(self): return self._num_src_nodes @@ -198,21 +213,134 @@ def num_src_nodes(self): def num_dst_nodes(self): return self._num_dst_nodes + def values(self): + return self._values + def formats(self): return self._formats - def coo(self) -> Tuple[torch.Tensor, torch.Tensor]: + def coo(self) -> Tuple[torch.Tensor, 
torch.Tensor, Optional[torch.Tensor]]: if "coo" not in self.formats(): raise RuntimeError( "The SparseGraph did not create a COO layout. " - "Set 'formats' to include 'coo' when creating the graph." + "Set 'formats' list to include 'coo' when creating the graph." ) - return (self._src_ids, self._dst_ids) + return self.src_ids(), self.dst_ids(), self._values - def csc(self) -> Tuple[torch.Tensor, torch.Tensor]: + def csc(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: if "csc" not in self.formats(): raise RuntimeError( "The SparseGraph did not create a CSC layout. " - "Set 'formats' to include 'csc' when creating the graph." + "Set 'formats' list to include 'csc' when creating the graph." + ) + return self.cdst_ids(), self.src_ids(), self._values + + def csr(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + if "csr" not in self.formats(): + raise RuntimeError( + "The SparseGraph did not create a CSR layout. " + "Set 'formats' list to include 'csr' when creating the graph." 
+ ) + csrc_ids = self.csrc_ids() + dst_ids = self.dst_ids()[self._perm_csc2csr] + value = self._values + if value is not None: + value = value[self._perm_csc2csr] + return csrc_ids, dst_ids, value + + +class BaseConv(torch.nn.Module): + r"""An abstract base class for cugraph-ops nn module.""" + + def __init__(self): + super().__init__() + + def reset_parameters(self): + r"""Resets all learnable parameters of the module.""" + raise NotImplementedError + + def forward(self, *args): + r"""Runs the forward pass of the module.""" + raise NotImplementedError + + def get_cugraph_ops_CSC( + self, + g: Union[SparseGraph, dgl.DGLHeteroGraph], + is_bipartite: bool = False, + max_in_degree: Optional[int] = None, + ) -> ops_torch.CSC: + """Create CSC structure needed by cugraph-ops.""" + + if not isinstance(g, (SparseGraph, dgl.DGLHeteroGraph)): + raise TypeError( + f"The graph has to be either a 'cugraph_dgl.nn.SparseGraph' or " + f"'dgl.DGLHeteroGraph', but got '{type(g)}'." ) - return (self._cdst_ids, self._src_ids) + + # TODO: max_in_degree should default to None in pylibcugraphops + if max_in_degree is None: + max_in_degree = -1 + + if isinstance(g, SparseGraph): + offsets, indices, _ = g.csc() + else: + offsets, indices, _ = g.adj_tensors("csc") + + graph = ops_torch.CSC( + offsets=offsets, + indices=indices, + num_src_nodes=g.num_src_nodes(), + dst_max_in_degree=max_in_degree, + is_bipartite=is_bipartite, + ) + + return graph + + def get_cugraph_ops_HeteroCSC( + self, + g: Union[SparseGraph, dgl.DGLHeteroGraph], + num_edge_types: int, + etypes: Optional[torch.Tensor] = None, + is_bipartite: bool = False, + max_in_degree: Optional[int] = None, + ) -> ops_torch.HeteroCSC: + """Create HeteroCSC structure needed by cugraph-ops.""" + + if not isinstance(g, (SparseGraph, dgl.DGLHeteroGraph)): + raise TypeError( + f"The graph has to be either a 'cugraph_dgl.nn.SparseGraph' or " + f"'dgl.DGLHeteroGraph', but got '{type(g)}'." 
+ ) + + # TODO: max_in_degree should default to None in pylibcugraphops + if max_in_degree is None: + max_in_degree = -1 + + if isinstance(g, SparseGraph): + offsets, indices, etypes = g.csc() + if etypes is None: + raise ValueError( + "SparseGraph must have 'values' to create HeteroCSC. " + "Pass in edge types as 'values' when creating the SparseGraph." + ) + etypes = etypes.int() + else: + if etypes is None: + raise ValueError( + "'etypes' is required when creating HeteroCSC " + "from dgl.DGLHeteroGraph." + ) + offsets, indices, perm = g.adj_tensors("csc") + etypes = etypes[perm].int() + + graph = ops_torch.HeteroCSC( + offsets=offsets, + indices=indices, + edge_types=etypes, + num_src_nodes=g.num_src_nodes(), + num_edge_types=num_edge_types, + dst_max_in_degree=max_in_degree, + is_bipartite=is_bipartite, + ) + + return graph diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py index 239def5b677..8843e61ad89 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py @@ -10,13 +10,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Torch Module for graph attention network layer using the aggregation -primitives in cugraph-ops""" -# pylint: disable=no-member, arguments-differ, invalid-name, too-many-arguments -from __future__ import annotations + from typing import Optional, Tuple, Union -from cugraph_dgl.nn.conv.base import BaseConv +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional dgl = import_optional("dgl") @@ -32,13 +29,15 @@ class GATConv(BaseConv): Parameters ---------- - in_feats : int, pair of ints + in_feats : int or tuple Input feature size. A pair denotes feature sizes of source and destination nodes. out_feats : int Output feature size. 
num_heads : int - Number of heads in Multi-Head Attention. + Number of heads in multi-head attention. + feat_drop : float, optional + Dropout rate on feature. Defaults: ``0``. concat : bool, optional If False, the multi-head attentions are averaged instead of concatenated. Default: ``True``. @@ -46,6 +45,15 @@ class GATConv(BaseConv): Edge feature size. Default: ``None``. negative_slope : float, optional LeakyReLU angle of negative slope. Defaults: ``0.2``. + residual : bool, optional + If True, use residual connection. Defaults: ``False``. + allow_zero_in_degree : bool, optional + If there are 0-in-degree nodes in the graph, output for those nodes will + be invalid since no message will be passed to those nodes. This is + harmful for some applications causing silent performance regression. + This module will raise a DGLError if it detects 0-in-degree nodes in + input graph. By setting ``True``, it will suppress the check and let the + users handle it by themselves. Defaults: ``False``. bias : bool, optional If True, learns a bias term. Defaults: ``True``. 
@@ -81,37 +89,46 @@ class GATConv(BaseConv): [ 1.6477, -1.9986], [ 1.1138, -1.9302]]], device='cuda:0', grad_fn=) """ - MAX_IN_DEGREE_MFG = 200 def __init__( self, in_feats: Union[int, Tuple[int, int]], out_feats: int, num_heads: int, + feat_drop: float = 0.0, concat: bool = True, edge_feats: Optional[int] = None, negative_slope: float = 0.2, + residual: bool = False, + allow_zero_in_degree: bool = False, bias: bool = True, ): super().__init__() self.in_feats = in_feats self.out_feats = out_feats + self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.num_heads = num_heads + self.feat_drop = nn.Dropout(feat_drop) self.concat = concat self.edge_feats = edge_feats self.negative_slope = negative_slope + self.allow_zero_in_degree = allow_zero_in_degree if isinstance(in_feats, int): - self.fc = nn.Linear(in_feats, num_heads * out_feats, bias=False) + self.lin = nn.Linear(in_feats, num_heads * out_feats, bias=False) else: - self.fc_src = nn.Linear(in_feats[0], num_heads * out_feats, bias=False) - self.fc_dst = nn.Linear(in_feats[1], num_heads * out_feats, bias=False) + self.lin_src = nn.Linear( + self.in_feats_src, num_heads * out_feats, bias=False + ) + self.lin_dst = nn.Linear( + self.in_feats_dst, num_heads * out_feats, bias=False + ) if edge_feats is not None: - self.fc_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) + self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) self.attn_weights = nn.Parameter(torch.Tensor(3 * num_heads * out_feats)) else: - self.register_parameter("fc_edge", None) + self.register_parameter("lin_edge", None) self.attn_weights = nn.Parameter(torch.Tensor(2 * num_heads * out_feats)) if bias and concat: @@ -121,28 +138,40 @@ def __init__( else: self.register_buffer("bias", None) + self.residual = residual and self.in_feats_dst != out_feats * num_heads + if self.residual: + self.lin_res = nn.Linear( + self.in_feats_dst, num_heads * out_feats, bias=bias + ) + else: + 
self.register_buffer("lin_res", None) + self.reset_parameters() def reset_parameters(self): r"""Reinitialize learnable parameters.""" gain = nn.init.calculate_gain("relu") - if hasattr(self, "fc"): - nn.init.xavier_normal_(self.fc.weight, gain=gain) + if hasattr(self, "lin"): + nn.init.xavier_normal_(self.lin.weight, gain=gain) else: - nn.init.xavier_normal_(self.fc_src.weight, gain=gain) - nn.init.xavier_normal_(self.fc_dst.weight, gain=gain) + nn.init.xavier_normal_(self.lin_src.weight, gain=gain) + nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) nn.init.xavier_normal_( self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain ) - if self.fc_edge is not None: - self.fc_edge.reset_parameters() + if self.lin_edge is not None: + self.lin_edge.reset_parameters() + + if self.lin_res is not None: + self.lin_res.reset_parameters() + if self.bias is not None: nn.init.zeros_(self.bias) def forward( self, - g: dgl.DGLHeteroGraph, + g: Union[SparseGraph, dgl.DGLHeteroGraph], nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, max_in_degree: Optional[int] = None, @@ -151,18 +180,17 @@ def forward( Parameters ---------- - graph : DGLGraph + graph : DGLGraph or SparseGraph The graph. nfeat : torch.Tensor Input features of shape :math:`(N, D_{in})`. efeat: torch.Tensor, optional Optional edge features. max_in_degree : int - Maximum in-degree of destination nodes. It is only effective when - :attr:`g` is a :class:`DGLBlock`, i.e., bipartite graph. When - :attr:`g` is generated from a neighbor sampler, the value should be - set to the corresponding :attr:`fanout`. If not given, - :attr:`max_in_degree` will be calculated on-the-fly. + Maximum in-degree of destination nodes. When :attr:`g` is generated + from a neighbor sampler, the value should be set to the corresponding + :attr:`fanout`. This option is used to invoke the MFG-variant of + cugraph-ops kernel. 
Returns ------- @@ -171,49 +199,63 @@ def forward( :math:`H` is the number of heads, and :math:`D_{out}` is size of output feature. """ - if max_in_degree is None: - max_in_degree = -1 - - bipartite = not isinstance(nfeat, torch.Tensor) - offsets, indices, _ = g.adj_tensors("csc") - - graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - dst_max_in_degree=max_in_degree, - is_bipartite=bipartite, + if isinstance(g, dgl.DGLHeteroGraph): + if not self.allow_zero_in_degree: + if (g.in_degrees() == 0).any(): + raise dgl.base.DGLError( + "There are 0-in-degree nodes in the graph, " + "output for those nodes will be invalid. " + "This is harmful for some applications, " + "causing silent performance regression. " + "Adding self-loop on the input graph by " + "calling `g = dgl.add_self_loop(g)` will resolve " + "the issue. Setting ``allow_zero_in_degree`` " + "to be `True` when constructing this module will " + "suppress the check and let the code run." + ) + + bipartite = isinstance(nfeat, (list, tuple)) + + _graph = self.get_cugraph_ops_CSC( + g, is_bipartite=bipartite, max_in_degree=max_in_degree ) + if bipartite: + nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1])) + nfeat_dst_orig = nfeat[1] + else: + nfeat = self.feat_drop(nfeat) + nfeat_dst_orig = nfeat[: g.num_dst_nodes()] + if efeat is not None: - if self.fc_edge is None: + if self.lin_edge is None: raise RuntimeError( f"{self.__class__.__name__}.edge_feats must be set to " f"accept edge features." ) - efeat = self.fc_edge(efeat) + efeat = self.lin_edge(efeat) if bipartite: - if not hasattr(self, "fc_src"): + if not hasattr(self, "lin_src"): raise RuntimeError( f"{self.__class__.__name__}.in_feats must be a pair of " f"integers to allow bipartite node features, but got " f"{self.in_feats}." 
) - nfeat_src = self.fc_src(nfeat[0]) - nfeat_dst = self.fc_dst(nfeat[1]) + nfeat_src = self.lin_src(nfeat[0]) + nfeat_dst = self.lin_dst(nfeat[1]) else: - if not hasattr(self, "fc"): + if not hasattr(self, "lin"): raise RuntimeError( f"{self.__class__.__name__}.in_feats is expected to be an " f"integer, but got {self.in_feats}." ) - nfeat = self.fc(nfeat) + nfeat = self.lin(nfeat) out = ops_torch.operators.mha_gat_n2n( (nfeat_src, nfeat_dst) if bipartite else nfeat, self.attn_weights, - graph, + _graph, num_heads=self.num_heads, activation="LeakyReLU", negative_slope=self.negative_slope, @@ -224,6 +266,12 @@ def forward( if self.concat: out = out.view(-1, self.num_heads, self.out_feats) + if self.residual: + res = self.lin_res(nfeat_dst_orig).view(-1, self.num_heads, self.out_feats) + if not self.concat: + res = res.mean(dim=1) + out = out + res + if self.bias is not None: out = out + self.bias diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py new file mode 100644 index 00000000000..209a5fe1a8d --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py @@ -0,0 +1,249 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional, Tuple, Union + +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph +from cugraph.utilities.utils import import_optional + +dgl = import_optional("dgl") +torch = import_optional("torch") +nn = import_optional("torch.nn") +ops_torch = import_optional("pylibcugraphops.pytorch") + + +class GATv2Conv(BaseConv): + r"""GATv2 from `How Attentive are Graph Attention Networks? + `__, with the sparse aggregation + accelerated by cugraph-ops. + + Parameters + ---------- + in_feats : int, or pair of ints + Input feature size; i.e, the number of dimensions of :math:`h_i^{(l)}`. + If the layer is to be applied to a unidirectional bipartite graph, `in_feats` + specifies the input feature size on both the source and destination nodes. + If a scalar is given, the source and destination node feature size + would take the same value. + out_feats : int + Output feature size; i.e, the number of dimensions of :math:`h_i^{(l+1)}`. + num_heads : int + Number of heads in Multi-Head Attention. + feat_drop : float, optional + Dropout rate on feature. Defaults: ``0``. + concat : bool, optional + If False, the multi-head attentions are averaged instead of concatenated. + Default: ``True``. + edge_feats : int, optional + Edge feature size. Default: ``None``. + negative_slope : float, optional + LeakyReLU angle of negative slope. Defaults: ``0.2``. + residual : bool, optional + If True, use residual connection. Defaults: ``False``. + allow_zero_in_degree : bool, optional + If there are 0-in-degree nodes in the graph, output for those nodes will + be invalid since no message will be passed to those nodes. This is + harmful for some applications causing silent performance regression. + This module will raise a DGLError if it detects 0-in-degree nodes in + input graph. By setting ``True``, it will suppress the check and let the + users handle it by themselves. Defaults: ``False``. 
+ bias : bool, optional + If set to :obj:`False`, the layer will not learn + an additive bias. (default: :obj:`True`) + share_weights : bool, optional + If set to :obj:`True`, the same matrix for :math:`W_{left}` and + :math:`W_{right}` in the above equations, will be applied to the source + and the target node of every edge. (default: :obj:`False`) + """ + + def __init__( + self, + in_feats: Union[int, Tuple[int, int]], + out_feats: int, + num_heads: int, + feat_drop: float = 0.0, + concat: bool = True, + edge_feats: Optional[int] = None, + negative_slope: float = 0.2, + residual: bool = False, + allow_zero_in_degree: bool = False, + bias: bool = True, + share_weights: bool = False, + ): + super().__init__() + self.in_feats = in_feats + self.out_feats = out_feats + self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) + self.num_heads = num_heads + self.feat_drop = nn.Dropout(feat_drop) + self.concat = concat + self.edge_feats = edge_feats + self.negative_slope = negative_slope + self.allow_zero_in_degree = allow_zero_in_degree + self.share_weights = share_weights + + self.lin_src = nn.Linear(self.in_feats_src, num_heads * out_feats, bias=bias) + if share_weights: + if self.in_feats_src != self.in_feats_dst: + raise ValueError( + f"Input feature size of source and destination " + f"nodes must be identical when share_weights is enabled, " + f"but got {self.in_feats_src} and {self.in_feats_dst}." 
+ ) + self.lin_dst = self.lin_src + else: + self.lin_dst = nn.Linear( + self.in_feats_dst, num_heads * out_feats, bias=bias + ) + + self.attn = nn.Parameter(torch.Tensor(num_heads * out_feats)) + + if edge_feats is not None: + self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) + else: + self.register_parameter("lin_edge", None) + + if bias and concat: + self.bias = nn.Parameter(torch.Tensor(num_heads, out_feats)) + elif bias and not concat: + self.bias = nn.Parameter(torch.Tensor(out_feats)) + else: + self.register_buffer("bias", None) + + self.residual = residual and self.in_feats_dst != out_feats * num_heads + if self.residual: + self.lin_res = nn.Linear( + self.in_feats_dst, num_heads * out_feats, bias=bias + ) + else: + self.register_buffer("lin_res", None) + + self.reset_parameters() + + def reset_parameters(self): + r"""Reinitialize learnable parameters.""" + gain = nn.init.calculate_gain("relu") + nn.init.xavier_normal_(self.lin_src.weight, gain=gain) + nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) + + nn.init.xavier_normal_( + self.attn.view(-1, self.num_heads, self.out_feats), gain=gain + ) + if self.lin_edge is not None: + self.lin_edge.reset_parameters() + + if self.lin_res is not None: + self.lin_res.reset_parameters() + + if self.bias is not None: + nn.init.zeros_(self.bias) + + def forward( + self, + g: Union[SparseGraph, dgl.DGLHeteroGraph], + nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + efeat: Optional[torch.Tensor] = None, + max_in_degree: Optional[int] = None, + ) -> torch.Tensor: + r"""Forward computation. + + Parameters + ---------- + graph : DGLGraph or SparseGraph + The graph. + nfeat : torch.Tensor + Input features of shape :math:`(N, D_{in})`. + efeat: torch.Tensor, optional + Optional edge features. + max_in_degree : int + Maximum in-degree of destination nodes. When :attr:`g` is generated + from a neighbor sampler, the value should be set to the corresponding + :attr:`fanout`. 
This option is used to invoke the MFG-variant of + cugraph-ops kernel. + + Returns + ------- + torch.Tensor + The output feature of shape :math:`(N, H, D_{out})` where + :math:`H` is the number of heads, and :math:`D_{out}` is size of + output feature. + """ + + if isinstance(g, dgl.DGLHeteroGraph): + if not self.allow_zero_in_degree: + if (g.in_degrees() == 0).any(): + raise dgl.base.DGLError( + "There are 0-in-degree nodes in the graph, " + "output for those nodes will be invalid. " + "This is harmful for some applications, " + "causing silent performance regression. " + "Adding self-loop on the input graph by " + "calling `g = dgl.add_self_loop(g)` will resolve " + "the issue. Setting ``allow_zero_in_degree`` " + "to be `True` when constructing this module will " + "suppress the check and let the code run." + ) + + nfeat_bipartite = isinstance(nfeat, (list, tuple)) + graph_bipartite = nfeat_bipartite or self.share_weights is False + + _graph = self.get_cugraph_ops_CSC( + g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree + ) + + if nfeat_bipartite: + nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1])) + nfeat_dst_orig = nfeat[1] + else: + nfeat = self.feat_drop(nfeat) + nfeat_dst_orig = nfeat[: g.num_dst_nodes()] + + if efeat is not None: + if self.lin_edge is None: + raise RuntimeError( + f"{self.__class__.__name__}.edge_feats must be set to " + f"accept edge features." 
+ ) + efeat = self.lin_edge(efeat) + + if nfeat_bipartite: + nfeat = (self.lin_src(nfeat[0]), self.lin_dst(nfeat[1])) + elif graph_bipartite: + nfeat = (self.lin_src(nfeat), self.lin_dst(nfeat[: g.num_dst_nodes()])) + else: + nfeat = self.lin_src(nfeat) + + out = ops_torch.operators.mha_gat_v2_n2n( + nfeat, + self.attn, + _graph, + num_heads=self.num_heads, + activation="LeakyReLU", + negative_slope=self.negative_slope, + concat_heads=self.concat, + edge_feat=efeat, + )[: g.num_dst_nodes()] + + if self.concat: + out = out.view(-1, self.num_heads, self.out_feats) + + if self.residual: + res = self.lin_res(nfeat_dst_orig).view(-1, self.num_heads, self.out_feats) + if not self.concat: + res = res.mean(dim=1) + out = out + res + + if self.bias is not None: + out = out + self.bias + + return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py index 89e49011cf7..54916674210 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py @@ -10,14 +10,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Torch Module for Relational graph convolution layer using the aggregation -primitives in cugraph-ops""" -# pylint: disable=no-member, arguments-differ, invalid-name, too-many-arguments -from __future__ import annotations + import math -from typing import Optional +from typing import Optional, Union -from cugraph_dgl.nn.conv.base import BaseConv +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional dgl = import_optional("dgl") @@ -29,13 +26,8 @@ class RelGraphConv(BaseConv): r"""An accelerated relational graph convolution layer from `Modeling Relational Data with Graph Convolutional Networks - `__ that leverages the highly-optimized - aggregation primitives in cugraph-ops. - - See :class:`dgl.nn.pytorch.conv.RelGraphConv` for mathematical model. - - This module depends on :code:`pylibcugraphops` package, which can be - installed via :code:`conda install -c nvidia pylibcugraphops>=23.02`. + `__, with the sparse aggregation + accelerated by cugraph-ops. Parameters ---------- @@ -84,7 +76,6 @@ class RelGraphConv(BaseConv): [-1.4335, -2.3758], [-1.4331, -2.3295]], device='cuda:0', grad_fn=) """ - MAX_IN_DEGREE_MFG = 500 def __init__( self, @@ -148,7 +139,7 @@ def reset_parameters(self): def forward( self, - g: dgl.DGLHeteroGraph, + g: Union[SparseGraph, dgl.DGLHeteroGraph], feat: torch.Tensor, etypes: torch.Tensor, max_in_degree: Optional[int] = None, @@ -167,49 +158,24 @@ def forward( so any input of other integer types will be casted into int32, thus introducing some overhead. Pass in int32 tensors directly for best performance. - max_in_degree : int, optional - Maximum in-degree of destination nodes. It is only effective when - :attr:`g` is a :class:`DGLBlock`, i.e., bipartite graph. When - :attr:`g` is generated from a neighbor sampler, the value should be - set to the corresponding :attr:`fanout`. If not given, - :attr:`max_in_degree` will be calculated on-the-fly. 
+ max_in_degree : int + Maximum in-degree of destination nodes. When :attr:`g` is generated + from a neighbor sampler, the value should be set to the corresponding + :attr:`fanout`. This option is used to invoke the MFG-variant of + cugraph-ops kernel. Returns ------- torch.Tensor New node features. Shape: :math:`(|V|, D_{out})`. """ - offsets, indices, edge_ids = g.adj_tensors("csc") - edge_types_perm = etypes[edge_ids.long()].int() - - if g.is_block: - if max_in_degree is None: - max_in_degree = g.in_degrees().max().item() - - if max_in_degree < self.MAX_IN_DEGREE_MFG: - _graph = ops_torch.SampledHeteroCSC( - offsets, - indices, - edge_types_perm, - max_in_degree, - g.num_src_nodes(), - self.num_rels, - ) - else: - offsets_fg = self.pad_offsets(offsets, g.num_src_nodes() + 1) - _graph = ops_torch.StaticHeteroCSC( - offsets_fg, - indices, - edge_types_perm, - self.num_rels, - ) - else: - _graph = ops_torch.StaticHeteroCSC( - offsets, - indices, - edge_types_perm, - self.num_rels, - ) + _graph = self.get_cugraph_ops_HeteroCSC( + g, + num_edge_types=self.num_rels, + etypes=etypes, + is_bipartite=False, + max_in_degree=max_in_degree, + ) h = ops_torch.operators.agg_hg_basis_n2n_post( feat, diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py index 60f4c505e19..a3f946d7cb4 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py @@ -10,11 +10,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Torch Module for GraphSAGE layer using the aggregation primitives in -cugraph-ops""" -# pylint: disable=no-member, arguments-differ, invalid-name, too-many-arguments -from __future__ import annotations -from typing import Optional, Union + +from typing import Optional, Tuple, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -27,22 +24,18 @@ class SAGEConv(BaseConv): r"""An accelerated GraphSAGE layer from `Inductive Representation Learning - on Large Graphs `__ that leverages the - highly-optimized aggregation primitives in cugraph-ops. - - See :class:`dgl.nn.pytorch.conv.SAGEConv` for mathematical model. - - This module depends on :code:`pylibcugraphops` package, which can be - installed via :code:`conda install -c nvidia pylibcugraphops>=23.02`. + on Large Graphs `, with the sparse + aggregation accelerated by cugraph-ops. Parameters ---------- - in_feats : int - Input feature size. + in_feats : int or tuple + Input feature size. If a scalar is given, the source and destination + nodes are required to be the same. out_feats : int Output feature size. aggregator_type : str - Aggregator type to use (``mean``, ``sum``, ``min``, ``max``). + Aggregator type to use ("mean", "sum", "min", "max", "pool", "gcn"). feat_drop : float Dropout rate on features, default: ``0``. 
bias : bool @@ -68,38 +61,57 @@ class SAGEConv(BaseConv): [-1.1690, 0.1952], [-1.1690, 0.1952]], device='cuda:0', grad_fn=) """ - MAX_IN_DEGREE_MFG = 500 + valid_aggr_types = {"mean", "sum", "min", "max", "pool", "gcn"} def __init__( self, - in_feats: int, + in_feats: Union[int, Tuple[int, int]], out_feats: int, aggregator_type: str = "mean", feat_drop: float = 0.0, bias: bool = True, ): super().__init__() - self.in_feats = in_feats - self.out_feats = out_feats - valid_aggr_types = {"max", "min", "mean", "sum"} - if aggregator_type not in valid_aggr_types: + + if aggregator_type not in self.valid_aggr_types: raise ValueError( - f"Invalid aggregator_type. Must be one of {valid_aggr_types}. " + f"Invalid aggregator_type. Must be one of {self.valid_aggr_types}. " f"But got '{aggregator_type}' instead." ) - self.aggr = aggregator_type + + self.aggregator_type = aggregator_type + self._aggr = aggregator_type + self.in_feats = in_feats + self.out_feats = out_feats + self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.feat_drop = nn.Dropout(feat_drop) - self.linear = nn.Linear(2 * in_feats, out_feats, bias=bias) + if self.aggregator_type == "gcn": + self._aggr = "mean" + self.lin = nn.Linear(self.in_feats_src, out_feats, bias=bias) + else: + self.lin = nn.Linear( + self.in_feats_src + self.in_feats_dst, out_feats, bias=bias + ) + + if self.aggregator_type == "pool": + self._aggr = "max" + self.pre_lin = nn.Linear(self.in_feats_src, self.in_feats_src) + else: + self.register_parameter("pre_lin", None) + + self.reset_parameters() def reset_parameters(self): r"""Reinitialize learnable parameters.""" - self.linear.reset_parameters() + self.lin.reset_parameters() + if self.pre_lin is not None: + self.pre_lin.reset_parameters() def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - feat: torch.Tensor, + feat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], max_in_degree: Optional[int] = None, ) -> torch.Tensor: r"""Forward 
computation. @@ -108,7 +120,7 @@ def forward( ---------- g : DGLGraph or SparseGraph The graph. - feat : torch.Tensor + feat : torch.Tensor or tuple Node features. Shape: :math:`(|V|, D_{in})`. max_in_degree : int Maximum in-degree of destination nodes. When :attr:`g` is generated @@ -121,36 +133,34 @@ def forward( torch.Tensor Output node features. Shape: :math:`(|V|, D_{out})`. """ - if max_in_degree is None: - max_in_degree = -1 - - if isinstance(g, SparseGraph): - assert "csc" in g.formats() - offsets, indices = g.csc() - _graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - dst_max_in_degree=max_in_degree, - ) - elif isinstance(g, dgl.DGLHeteroGraph): - offsets, indices, _ = g.adj_tensors("csc") - _graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - dst_max_in_degree=max_in_degree, - ) - else: - raise TypeError( - f"The graph has to be either a 'SparseGraph' or " - f"'dgl.DGLHeteroGraph', but got '{type(g)}'." 
- ) + feat_bipartite = isinstance(feat, (list, tuple)) + graph_bipartite = feat_bipartite or self.aggregator_type == "pool" + + _graph = self.get_cugraph_ops_CSC( + g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree + ) - feat = self.feat_drop(feat) - h = ops_torch.operators.agg_concat_n2n(feat, _graph, self.aggr)[ + if feat_bipartite: + feat = (self.feat_drop(feat[0]), self.feat_drop(feat[1])) + else: + feat = self.feat_drop(feat) + + if self.aggregator_type == "pool": + if feat_bipartite: + feat = (self.pre_lin(feat[0]).relu(), feat[1]) + else: + feat = (self.pre_lin(feat).relu(), feat[: g.num_dst_nodes()]) + # force ctx.needs_input_grad=True in cugraph-ops autograd function + feat[0].requires_grad_() + feat[1].requires_grad_() + + out = ops_torch.operators.agg_concat_n2n(feat, _graph, self._aggr)[ : g.num_dst_nodes() ] - h = self.linear(h) - return h + if self.aggregator_type == "gcn": + out = out[:, : self.in_feats_src] + + out = self.lin(out) + + return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py index 5cd5fbbaebe..8481b9ee265 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py @@ -10,9 +10,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ from typing import Optional, Tuple, Union -from cugraph_dgl.nn.conv.base import BaseConv +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional dgl = import_optional("dgl") @@ -114,7 +115,7 @@ def reset_parameters(self): def forward( self, - g: dgl.DGLHeteroGraph, + g: Union[SparseGraph, dgl.DGLHeteroGraph], nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -130,17 +131,12 @@ def forward( efeat: torch.Tensor, optional Edge feature tensor. Default: ``None``. """ - offsets, indices, _ = g.adj_tensors("csc") - graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - is_bipartite=True, - ) - - if isinstance(nfeat, torch.Tensor): + feat_bipartite = isinstance(nfeat, (list, tuple)) + if not feat_bipartite: nfeat = (nfeat, nfeat) + _graph = self.get_cugraph_ops_CSC(g, is_bipartite=True) + query = self.lin_query(nfeat[1][: g.num_dst_nodes()]) key = self.lin_key(nfeat[0]) value = self.lin_value(nfeat[0]) @@ -157,7 +153,7 @@ def forward( key_emb=key, query_emb=query, value_emb=value, - graph=graph, + graph=_graph, num_heads=self.num_heads, concat_heads=self.concat, edge_emb=efeat, diff --git a/python/cugraph-dgl/tests/conftest.py b/python/cugraph-dgl/tests/conftest.py index 6f8690d1140..a3863ed81fa 100644 --- a/python/cugraph-dgl/tests/conftest.py +++ b/python/cugraph-dgl/tests/conftest.py @@ -40,16 +40,19 @@ class SparseGraphData1: nnz = 6 src_ids = torch.IntTensor([0, 1, 2, 3, 2, 5]).cuda() dst_ids = torch.IntTensor([1, 2, 3, 4, 0, 3]).cuda() + values = torch.IntTensor([10, 20, 30, 40, 50, 60]).cuda() # CSR src_ids_sorted_by_src = torch.IntTensor([0, 1, 2, 2, 3, 5]).cuda() dst_ids_sorted_by_src = torch.IntTensor([1, 2, 0, 3, 4, 3]).cuda() csrc_ids = torch.IntTensor([0, 1, 2, 4, 5, 5, 6]).cuda() + values_csr = torch.IntTensor([10, 20, 50, 30, 40, 60]).cuda() # CSC src_ids_sorted_by_dst = 
torch.IntTensor([2, 0, 1, 5, 2, 3]).cuda() dst_ids_sorted_by_dst = torch.IntTensor([0, 1, 2, 3, 3, 4]).cuda() cdst_ids = torch.IntTensor([0, 1, 2, 3, 5, 6]).cuda() + values_csc = torch.IntTensor([50, 10, 20, 60, 30, 40]).cuda() @pytest.fixture diff --git a/python/cugraph-dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/tests/nn/test_gatconv.py index 7ed65645a28..ef3047dc2cd 100644 --- a/python/cugraph-dgl/tests/nn/test_gatconv.py +++ b/python/cugraph-dgl/tests/nn/test_gatconv.py @@ -10,69 +10,84 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# pylint: disable=too-many-arguments, too-many-locals import pytest -try: - import cugraph_dgl -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional +from cugraph_dgl.nn.conv.base import SparseGraph +from cugraph_dgl.nn import GATConv as CuGraphGATConv from .common import create_graph1 -torch = import_optional("torch") -dgl = import_optional("dgl") +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") + +ATOL = 1e-6 @pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("idtype_int", [False, True]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) +@pytest.mark.parametrize("residual", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) -def test_gatconv_equality(bipartite, idtype_int, max_in_degree, num_heads, to_block): - GATConv = dgl.nn.GATConv - CuGraphGATConv = cugraph_dgl.nn.GATConv - device = "cuda" - g = create_graph1().to(device) +@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) +def test_gatconv_equality( + bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format +): + from 
dgl.nn.pytorch import GATConv + + g = create_graph1().to("cuda") if idtype_int: g = g.int() - if to_block: g = dgl.to_block(g) + size = (g.num_src_nodes(), g.num_dst_nodes()) + if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0], device=device), - torch.rand(g.num_dst_nodes(), in_feats[1], device=device), + torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), + torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats, device=device) + nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() out_feats = 2 + if sparse_format == "coo": + sg = SparseGraph( + size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" + ) + elif sparse_format == "csc": + offsets, indices, _ = g.adj_tensors("csc") + sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") + args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False} + kwargs = {"bias": False, "allow_zero_in_degree": True} - conv1 = GATConv(*args, **kwargs, allow_zero_in_degree=True).to(device) + conv1 = GATConv(*args, **kwargs).cuda() out1 = conv1(g, nfeat) - conv2 = CuGraphGATConv(*args, **kwargs).to(device) + conv2 = CuGraphGATConv(*args, **kwargs).cuda() dim = num_heads * out_feats with torch.no_grad(): conv2.attn_weights.data[:dim] = conv1.attn_l.data.flatten() conv2.attn_weights.data[dim:] = conv1.attn_r.data.flatten() if bipartite: - conv2.fc_src.weight.data = conv1.fc_src.weight.data.detach().clone() - conv2.fc_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() + conv2.lin_src.weight.data = conv1.fc_src.weight.data.detach().clone() + conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() else: - conv2.fc.weight.data = conv1.fc.weight.data.detach().clone() - out2 = conv2(g, nfeat, max_in_degree=max_in_degree) + conv2.lin.weight.data = conv1.fc.weight.data.detach().clone() + if residual and conv2.residual: + conv2.lin_res.weight.data = 
conv1.fc_res.weight.data.detach().clone() - assert torch.allclose(out1, out2, atol=1e-6) + if sparse_format is not None: + out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) + else: + out2 = conv2(g, nfeat, max_in_degree=max_in_degree) + + assert torch.allclose(out1, out2, atol=ATOL) grad_out1 = torch.rand_like(out1) grad_out2 = grad_out1.clone().detach() @@ -81,18 +96,18 @@ def test_gatconv_equality(bipartite, idtype_int, max_in_degree, num_heads, to_bl if bipartite: assert torch.allclose( - conv1.fc_src.weight.grad, conv2.fc_src.weight.grad, atol=1e-6 + conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL ) assert torch.allclose( - conv1.fc_dst.weight.grad, conv2.fc_dst.weight.grad, atol=1e-6 + conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL ) else: - assert torch.allclose(conv1.fc.weight.grad, conv2.fc.weight.grad, atol=1e-6) + assert torch.allclose(conv1.fc.weight.grad, conv2.lin.weight.grad, atol=ATOL) assert torch.allclose( torch.cat((conv1.attn_l.grad, conv1.attn_r.grad), dim=0), conv2.attn_weights.grad.view(2, num_heads, out_feats), - atol=1e-6, + atol=ATOL, ) @@ -106,10 +121,7 @@ def test_gatconv_equality(bipartite, idtype_int, max_in_degree, num_heads, to_bl def test_gatconv_edge_feats( bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats ): - from cugraph_dgl.nn import GATConv - - device = "cuda" - g = create_graph1().to(device) + g = create_graph1().to("cuda") if to_block: g = dgl.to_block(g) @@ -117,24 +129,30 @@ def test_gatconv_edge_feats( if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0], device=device), - torch.rand(g.num_dst_nodes(), in_feats[1], device=device), + torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), + torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats, device=device) + nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() out_feats = 2 if use_edge_feats: edge_feats = 3 - efeat 
= torch.rand(g.num_edges(), edge_feats, device=device) + efeat = torch.rand(g.num_edges(), edge_feats).cuda() else: edge_feats = None efeat = None - conv = GATConv( - in_feats, out_feats, num_heads, concat=concat, edge_feats=edge_feats, bias=bias - ).to(device) + conv = CuGraphGATConv( + in_feats, + out_feats, + num_heads, + concat=concat, + edge_feats=edge_feats, + bias=bias, + allow_zero_in_degree=True, + ).cuda() out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) grad_out = torch.rand_like(out) diff --git a/python/cugraph-dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/tests/nn/test_gatv2conv.py new file mode 100644 index 00000000000..cc46a6e4b39 --- /dev/null +++ b/python/cugraph-dgl/tests/nn/test_gatv2conv.py @@ -0,0 +1,147 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from cugraph_dgl.nn.conv.base import SparseGraph +from cugraph_dgl.nn import GATv2Conv as CuGraphGATv2Conv +from .common import create_graph1 + +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") + +ATOL = 1e-6 + + +@pytest.mark.parametrize("bipartite", [False, True]) +@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("max_in_degree", [None, 8]) +@pytest.mark.parametrize("num_heads", [1, 2, 7]) +@pytest.mark.parametrize("residual", [False, True]) +@pytest.mark.parametrize("to_block", [False, True]) +@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) +def test_gatv2conv_equality( + bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format +): + from dgl.nn.pytorch import GATv2Conv + + g = create_graph1().to("cuda") + + if idtype_int: + g = g.int() + if to_block: + g = dgl.to_block(g) + + size = (g.num_src_nodes(), g.num_dst_nodes()) + + if bipartite: + in_feats = (10, 3) + nfeat = ( + torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), + torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + ) + else: + in_feats = 10 + nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + out_feats = 2 + + if sparse_format == "coo": + sg = SparseGraph( + size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" + ) + elif sparse_format == "csc": + offsets, indices, _ = g.adj_tensors("csc") + sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") + + args = (in_feats, out_feats, num_heads) + kwargs = {"bias": False, "allow_zero_in_degree": True} + + conv1 = GATv2Conv(*args, **kwargs).cuda() + out1 = conv1(g, nfeat) + + conv2 = CuGraphGATv2Conv(*args, **kwargs).cuda() + with torch.no_grad(): + conv2.attn.data = conv1.attn.data.flatten() + conv2.lin_src.weight.data = conv1.fc_src.weight.data.detach().clone() + conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() + if 
residual and conv2.residual: + conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone() + + if sparse_format is not None: + out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) + else: + out2 = conv2(g, nfeat, max_in_degree=max_in_degree) + + assert torch.allclose(out1, out2, atol=ATOL) + + grad_out1 = torch.rand_like(out1) + grad_out2 = grad_out1.clone().detach() + out1.backward(grad_out1) + out2.backward(grad_out2) + + assert torch.allclose( + conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL + ) + assert torch.allclose( + conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL + ) + + assert torch.allclose(conv1.attn.grad, conv1.attn.grad, atol=ATOL) + + +@pytest.mark.parametrize("bias", [False, True]) +@pytest.mark.parametrize("bipartite", [False, True]) +@pytest.mark.parametrize("concat", [False, True]) +@pytest.mark.parametrize("max_in_degree", [None, 8, 800]) +@pytest.mark.parametrize("num_heads", [1, 2, 7]) +@pytest.mark.parametrize("to_block", [False, True]) +@pytest.mark.parametrize("use_edge_feats", [False, True]) +def test_gatv2conv_edge_feats( + bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats +): + g = create_graph1().to("cuda") + + if to_block: + g = dgl.to_block(g) + + if bipartite: + in_feats = (10, 3) + nfeat = ( + torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), + torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + ) + else: + in_feats = 10 + nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + out_feats = 2 + + if use_edge_feats: + edge_feats = 3 + efeat = torch.rand(g.num_edges(), edge_feats).cuda() + else: + edge_feats = None + efeat = None + + conv = CuGraphGATv2Conv( + in_feats, + out_feats, + num_heads, + concat=concat, + edge_feats=edge_feats, + bias=bias, + allow_zero_in_degree=True, + ).cuda() + out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) + + grad_out = torch.rand_like(out) + out.backward(grad_out) diff --git 
a/python/cugraph-dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/tests/nn/test_relgraphconv.py index d2ae6a23978..901f9ba1433 100644 --- a/python/cugraph-dgl/tests/nn/test_relgraphconv.py +++ b/python/cugraph-dgl/tests/nn/test_relgraphconv.py @@ -10,20 +10,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# pylint: disable=too-many-arguments, too-many-locals import pytest -try: - import cugraph_dgl -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional +from cugraph_dgl.nn.conv.base import SparseGraph +from cugraph_dgl.nn import RelGraphConv as CuGraphRelGraphConv from .common import create_graph1 -torch = import_optional("torch") -dgl = import_optional("dgl") +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") + +ATOL = 1e-6 @pytest.mark.parametrize("idtype_int", [False, True]) @@ -32,12 +29,17 @@ @pytest.mark.parametrize("regularizer", [None, "basis"]) @pytest.mark.parametrize("self_loop", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) +@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_relgraphconv_equality( - idtype_int, max_in_degree, num_bases, regularizer, self_loop, to_block + idtype_int, + max_in_degree, + num_bases, + regularizer, + self_loop, + to_block, + sparse_format, ): - RelGraphConv = dgl.nn.RelGraphConv - CuGraphRelGraphConv = cugraph_dgl.nn.RelGraphConv - device = "cuda" + from dgl.nn.pytorch import RelGraphConv in_feat, out_feat, num_rels = 10, 2, 3 args = (in_feat, out_feat, num_rels) @@ -47,34 +49,57 @@ def test_relgraphconv_equality( "bias": False, "self_loop": self_loop, } - g = create_graph1().to(device) - g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).to(device) + g = 
create_graph1().to("cuda") + g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).cuda() + if idtype_int: g = g.int() if to_block: g = dgl.to_block(g) - feat = torch.rand(g.num_src_nodes(), in_feat).to(device) + + size = (g.num_src_nodes(), g.num_dst_nodes()) + feat = torch.rand(g.num_src_nodes(), in_feat).cuda() + + if sparse_format == "coo": + sg = SparseGraph( + size=size, + src_ids=g.edges()[0], + dst_ids=g.edges()[1], + values=g.edata[dgl.ETYPE], + formats="csc", + ) + elif sparse_format == "csc": + offsets, indices, perm = g.adj_tensors("csc") + etypes = g.edata[dgl.ETYPE][perm] + sg = SparseGraph( + size=size, src_ids=indices, cdst_ids=offsets, values=etypes, formats="csc" + ) torch.manual_seed(0) - conv1 = RelGraphConv(*args, **kwargs).to(device) + conv1 = RelGraphConv(*args, **kwargs).cuda() torch.manual_seed(0) kwargs["apply_norm"] = False - conv2 = CuGraphRelGraphConv(*args, **kwargs).to(device) + conv2 = CuGraphRelGraphConv(*args, **kwargs).cuda() out1 = conv1(g, feat, g.edata[dgl.ETYPE]) - out2 = conv2(g, feat, g.edata[dgl.ETYPE], max_in_degree=max_in_degree) - assert torch.allclose(out1, out2, atol=1e-06) + + if sparse_format is not None: + out2 = conv2(sg, feat, sg.values(), max_in_degree=max_in_degree) + else: + out2 = conv2(g, feat, g.edata[dgl.ETYPE], max_in_degree=max_in_degree) + + assert torch.allclose(out1, out2, atol=ATOL) grad_out = torch.rand_like(out1) out1.backward(grad_out) out2.backward(grad_out) end = -1 if self_loop else None - assert torch.allclose(conv1.linear_r.W.grad, conv2.W.grad[:end], atol=1e-6) + assert torch.allclose(conv1.linear_r.W.grad, conv2.W.grad[:end], atol=ATOL) if self_loop: - assert torch.allclose(conv1.loop_weight.grad, conv2.W.grad[-1], atol=1e-6) + assert torch.allclose(conv1.loop_weight.grad, conv2.W.grad[-1], atol=ATOL) if regularizer is not None: - assert torch.allclose(conv1.linear_r.coeff.grad, conv2.coeff.grad, atol=1e-6) + assert torch.allclose(conv1.linear_r.coeff.grad, conv2.coeff.grad, 
atol=ATOL) diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/tests/nn/test_sageconv.py index 447bbe49460..e2acf9e6596 100644 --- a/python/cugraph-dgl/tests/nn/test_sageconv.py +++ b/python/cugraph-dgl/tests/nn/test_sageconv.py @@ -10,31 +10,33 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# pylint: disable=too-many-arguments, too-many-locals import pytest -from cugraph.utilities.utils import import_optional from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import SAGEConv as CuGraphSAGEConv from .common import create_graph1 -torch = import_optional("torch") -dgl = import_optional("dgl") +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") +ATOL = 1e-6 + +@pytest.mark.parametrize("aggr", ["mean", "pool"]) @pytest.mark.parametrize("bias", [False, True]) +@pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("idtype_int", [False, True]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block, sparse_format): - SAGEConv = dgl.nn.SAGEConv - device = "cuda" +def test_sageconv_equality( + aggr, bias, bipartite, idtype_int, max_in_degree, to_block, sparse_format +): + from dgl.nn.pytorch import SAGEConv - in_feat, out_feat = 5, 2 - kwargs = {"aggregator_type": "mean", "bias": bias} - g = create_graph1().to(device) + kwargs = {"aggregator_type": aggr, "bias": bias} + g = create_graph1().to("cuda") if idtype_int: g = g.int() @@ -42,7 +44,17 @@ def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block, sparse_for g = dgl.to_block(g) size = (g.num_src_nodes(), g.num_dst_nodes()) - feat = 
torch.rand(g.num_src_nodes(), in_feat).to(device) + + if bipartite: + in_feats = (5, 3) + feat = ( + torch.rand(size[0], in_feats[0], requires_grad=True).cuda(), + torch.rand(size[1], in_feats[1], requires_grad=True).cuda(), + ) + else: + in_feats = 5 + feat = torch.rand(size[0], in_feats).cuda() + out_feats = 2 if sparse_format == "coo": sg = SparseGraph( @@ -52,39 +64,38 @@ def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block, sparse_for offsets, indices, _ = g.adj_tensors("csc") sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - torch.manual_seed(0) - conv1 = SAGEConv(in_feat, out_feat, **kwargs).to(device) - - torch.manual_seed(0) - conv2 = CuGraphSAGEConv(in_feat, out_feat, **kwargs).to(device) + conv1 = SAGEConv(in_feats, out_feats, **kwargs).cuda() + conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).cuda() + in_feats_src = conv2.in_feats_src with torch.no_grad(): - conv2.linear.weight.data[:, :in_feat] = conv1.fc_neigh.weight.data - conv2.linear.weight.data[:, in_feat:] = conv1.fc_self.weight.data + conv2.lin.weight.data[:, :in_feats_src] = conv1.fc_neigh.weight.data + conv2.lin.weight.data[:, in_feats_src:] = conv1.fc_self.weight.data if bias: - conv2.linear.bias.data[:] = conv1.fc_self.bias.data + conv2.lin.bias.data[:] = conv1.fc_self.bias.data + if aggr == "pool": + conv2.pre_lin.weight.data[:] = conv1.fc_pool.weight.data + conv2.pre_lin.bias.data[:] = conv1.fc_pool.bias.data out1 = conv1(g, feat) if sparse_format is not None: out2 = conv2(sg, feat, max_in_degree=max_in_degree) else: out2 = conv2(g, feat, max_in_degree=max_in_degree) - assert torch.allclose(out1, out2, atol=1e-06) + assert torch.allclose(out1, out2, atol=ATOL) grad_out = torch.rand_like(out1) out1.backward(grad_out) out2.backward(grad_out) assert torch.allclose( conv1.fc_neigh.weight.grad, - conv2.linear.weight.grad[:, :in_feat], - atol=1e-6, + conv2.lin.weight.grad[:, :in_feats_src], + atol=ATOL, ) assert torch.allclose( 
conv1.fc_self.weight.grad, - conv2.linear.weight.grad[:, in_feat:], - atol=1e-6, + conv2.lin.weight.grad[:, in_feats_src:], + atol=ATOL, ) if bias: - assert torch.allclose( - conv1.fc_self.bias.grad, conv2.linear.bias.grad, atol=1e-6 - ) + assert torch.allclose(conv1.fc_self.bias.grad, conv2.lin.bias.grad, atol=ATOL) diff --git a/python/cugraph-dgl/tests/nn/test_sparsegraph.py b/python/cugraph-dgl/tests/nn/test_sparsegraph.py index 3fb01575d66..09c0df202ff 100644 --- a/python/cugraph-dgl/tests/nn/test_sparsegraph.py +++ b/python/cugraph-dgl/tests/nn/test_sparsegraph.py @@ -19,32 +19,42 @@ def test_coo2csc(sparse_graph_1): data = sparse_graph_1 - values = torch.ones(data.nnz).cuda() + g = SparseGraph( - size=data.size, src_ids=data.src_ids, dst_ids=data.dst_ids, formats="csc" + size=data.size, + src_ids=data.src_ids, + dst_ids=data.dst_ids, + values=data.values, + formats=["csc"], ) - cdst_ids, src_ids = g.csc() + cdst_ids, src_ids, values = g.csc() new = torch.sparse_csc_tensor(cdst_ids, src_ids, values).cuda() old = torch.sparse_coo_tensor( - torch.vstack((data.src_ids, data.dst_ids)), values + torch.vstack((data.src_ids, data.dst_ids)), data.values ).cuda() torch.allclose(new.to_dense(), old.to_dense()) -def test_csc2coo(sparse_graph_1): +def test_csc_input(sparse_graph_1): data = sparse_graph_1 - values = torch.ones(data.nnz).cuda() + g = SparseGraph( size=data.size, src_ids=data.src_ids_sorted_by_dst, cdst_ids=data.cdst_ids, - formats="coo", + values=data.values_csc, + formats=["coo", "csc", "csr"], ) - src_ids, dst_ids = g.coo() + src_ids, dst_ids, values = g.coo() new = torch.sparse_coo_tensor(torch.vstack((src_ids, dst_ids)), values).cuda() old = torch.sparse_csc_tensor( - data.cdst_ids, data.src_ids_sorted_by_dst, values + data.cdst_ids, data.src_ids_sorted_by_dst, data.values_csc ).cuda() torch.allclose(new.to_dense(), old.to_dense()) + + csrc_ids, dst_ids, values = g.csr() + + new = torch.sparse_csr_tensor(csrc_ids, dst_ids, values).cuda() + 
torch.allclose(new.to_dense(), old.to_dense()) diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/tests/nn/test_transformerconv.py index 00476b9f0bb..b2b69cb35ab 100644 --- a/python/cugraph-dgl/tests/nn/test_transformerconv.py +++ b/python/cugraph-dgl/tests/nn/test_transformerconv.py @@ -13,16 +13,14 @@ import pytest -try: - from cugraph_dgl.nn import TransformerConv -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional +from cugraph_dgl.nn.conv.base import SparseGraph +from cugraph_dgl.nn import TransformerConv from .common import create_graph1 -torch = import_optional("torch") -dgl = import_optional("dgl") +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") + +ATOL = 1e-6 @pytest.mark.parametrize("beta", [False, True]) @@ -32,8 +30,16 @@ @pytest.mark.parametrize("num_heads", [1, 2, 3, 4]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) -def test_TransformerConv( - beta, bipartite_node_feats, concat, idtype_int, num_heads, to_block, use_edge_feats +@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) +def test_transformerconv( + beta, + bipartite_node_feats, + concat, + idtype_int, + num_heads, + to_block, + use_edge_feats, + sparse_format, ): device = "cuda" g = create_graph1().to(device) @@ -44,6 +50,15 @@ def test_TransformerConv( if to_block: g = dgl.to_block(g) + size = (g.num_src_nodes(), g.num_dst_nodes()) + if sparse_format == "coo": + sg = SparseGraph( + size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" + ) + elif sparse_format == "csc": + offsets, indices, _ = g.adj_tensors("csc") + sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") + if bipartite_node_feats: in_node_feats = (5, 3) nfeat = ( @@ -71,6 +86,10 @@ def 
test_TransformerConv( edge_feats=edge_feats, ).to(device) - out = conv(g, nfeat, efeat) + if sparse_format is not None: + out = conv(sg, nfeat, efeat) + else: + out = conv(g, nfeat, efeat) + grad_out = torch.rand_like(out) out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/test_dataset.py b/python/cugraph-dgl/tests/test_dataset.py index 69d50261e55..5db443dc0d8 100644 --- a/python/cugraph-dgl/tests/test_dataset.py +++ b/python/cugraph-dgl/tests/test_dataset.py @@ -123,6 +123,6 @@ def test_homogeneous_sampled_graphs_from_dataframe(return_type, seed_node): assert dgl_block.num_src_nodes() == cugraph_dgl_graph.num_src_nodes() assert dgl_block.num_dst_nodes() == cugraph_dgl_graph.num_dst_nodes() dgl_offsets, dgl_indices, _ = dgl_block.adj_tensors("csc") - cugraph_offsets, cugraph_indices = cugraph_dgl_graph.csc() + cugraph_offsets, cugraph_indices, _ = cugraph_dgl_graph.csc() assert torch.equal(dgl_offsets.to("cpu"), cugraph_offsets.to("cpu")) assert torch.equal(dgl_indices.to("cpu"), cugraph_indices.to("cpu")) diff --git a/python/cugraph-dgl/tests/test_from_dgl_hetrograph.py b/python/cugraph-dgl/tests/test_from_dgl_heterograph.py similarity index 100% rename from python/cugraph-dgl/tests/test_from_dgl_hetrograph.py rename to python/cugraph-dgl/tests/test_from_dgl_heterograph.py From ed7b1a41fe502c9097c4ac9688f08c1d1e5fd33f Mon Sep 17 00:00:00 2001 From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com> Date: Tue, 19 Sep 2023 13:48:52 -0400 Subject: [PATCH 07/22] New mtmg API for integration (#3521) Creating a new API for integrating multi-threaded multi-GPU programs into the cugraph library. This API will extend our OPG (one [process] per GPU) model to support a single process handling multiple GPUs, and will also ultimately support a multi-node configuration where some compute nodes might not have GPUs. 
closes https://github.com/rapidsai/graph_dl/issues/241 Authors: - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Seunghwa Kang (https://github.com/seunghwak) URL: https://github.com/rapidsai/cugraph/pull/3521 --- cpp/CMakeLists.txt | 2 + cpp/cmake/thirdparty/get_ucp.cmake | 35 ++ .../mtmg/detail/device_shared_device_span.hpp | 39 ++ .../detail/device_shared_device_vector.hpp | 58 +++ .../mtmg/detail/device_shared_wrapper.hpp | 123 +++++ .../mtmg/detail/per_device_edgelist.hpp | 275 +++++++++++ cpp/include/cugraph/mtmg/edge_property.hpp | 60 +++ .../cugraph/mtmg/edge_property_view.hpp | 33 ++ cpp/include/cugraph/mtmg/edgelist.hpp | 65 +++ cpp/include/cugraph/mtmg/graph.hpp | 136 ++++++ cpp/include/cugraph/mtmg/graph_view.hpp | 34 ++ cpp/include/cugraph/mtmg/handle.hpp | 111 +++++ cpp/include/cugraph/mtmg/instance_manager.hpp | 98 ++++ .../cugraph/mtmg/per_thread_edgelist.hpp | 174 +++++++ cpp/include/cugraph/mtmg/renumber_map.hpp | 40 ++ .../cugraph/mtmg/renumber_map_view.hpp | 32 ++ cpp/include/cugraph/mtmg/resource_manager.hpp | 225 +++++++++ cpp/include/cugraph/mtmg/vertex_result.hpp | 40 ++ .../cugraph/mtmg/vertex_result_view.hpp | 49 ++ cpp/src/link_analysis/pagerank_impl.cuh | 8 +- cpp/src/mtmg/vertex_result.cu | 167 +++++++ cpp/tests/CMakeLists.txt | 8 + cpp/tests/mtmg/threaded_test.cu | 459 ++++++++++++++++++ 23 files changed, 2268 insertions(+), 3 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_ucp.cmake create mode 100644 cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp create mode 100644 cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp create mode 100644 cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp create mode 100644 cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp create mode 100644 cpp/include/cugraph/mtmg/edge_property.hpp create mode 100644 cpp/include/cugraph/mtmg/edge_property_view.hpp create mode 100644 cpp/include/cugraph/mtmg/edgelist.hpp create mode 100644 
cpp/include/cugraph/mtmg/graph.hpp create mode 100644 cpp/include/cugraph/mtmg/graph_view.hpp create mode 100644 cpp/include/cugraph/mtmg/handle.hpp create mode 100644 cpp/include/cugraph/mtmg/instance_manager.hpp create mode 100644 cpp/include/cugraph/mtmg/per_thread_edgelist.hpp create mode 100644 cpp/include/cugraph/mtmg/renumber_map.hpp create mode 100644 cpp/include/cugraph/mtmg/renumber_map_view.hpp create mode 100644 cpp/include/cugraph/mtmg/resource_manager.hpp create mode 100644 cpp/include/cugraph/mtmg/vertex_result.hpp create mode 100644 cpp/include/cugraph/mtmg/vertex_result_view.hpp create mode 100644 cpp/src/mtmg/vertex_result.cu create mode 100644 cpp/tests/mtmg/threaded_test.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 69a488de0b8..a6c26ee3b91 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -166,6 +166,7 @@ endif() include(cmake/thirdparty/get_nccl.cmake) include(cmake/thirdparty/get_cuhornet.cmake) +include(cmake/thirdparty/get_ucp.cmake) if(BUILD_TESTS) include(cmake/thirdparty/get_gtest.cmake) @@ -292,6 +293,7 @@ set(CUGRAPH_SOURCES src/community/triangle_count_mg.cu src/traversal/k_hop_nbrs_sg.cu src/traversal/k_hop_nbrs_mg.cu + src/mtmg/vertex_result.cu ) if(USE_CUGRAPH_OPS) diff --git a/cpp/cmake/thirdparty/get_ucp.cmake b/cpp/cmake/thirdparty/get_ucp.cmake new file mode 100644 index 00000000000..dcc4956a34e --- /dev/null +++ b/cpp/cmake/thirdparty/get_ucp.cmake @@ -0,0 +1,35 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_ucp) + + if(TARGET UCP::UCP) + return() + endif() + + rapids_find_generate_module(UCP + HEADER_NAMES ucp.h + LIBRARY_NAMES ucp + INCLUDE_SUFFIXES ucp/api + ) + + # Currently UCP has no CMake build-system so we require + # it built and installed on the machine already + rapids_find_package(UCP REQUIRED) + +endfunction() + +find_and_configure_ucp() diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp new file mode 100644 index 00000000000..37398891370 --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief Wrap an object to be available for each GPU + * + * In the MTMG environment we need the ability to manage a collection of objects + * that are associated with a particular GPU, and fetch the objects from an + * arbitrary GPU thread. This object will wrap any object and allow it to be + * accessed from different threads. + */ +template +using device_shared_device_span_t = device_shared_wrapper_t>; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp new file mode 100644 index 00000000000..7f3992b73bd --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief Wrap an object to be available for each GPU + * + * In the MTMG environment we need the ability to manage a collection of objects + * that are associated with a particular GPU, and fetch the objects from an + * arbitrary GPU thread. This object will wrap any object and allow it to be + * accessed from different threads. 
+ */ +template +class device_shared_device_vector_t : public device_shared_wrapper_t> { + using parent_t = detail::device_shared_wrapper_t>; + + public: + /** + * @brief Create a device_shared_device_span (read only view) + */ + auto view() + { + std::lock_guard lock(parent_t::lock_); + + device_shared_device_span_t result; + + std::for_each(parent_t::objects_.begin(), parent_t::objects_.end(), [&result](auto& p) { + result.set(p.first, raft::device_span{p.second.data(), p.second.size()}); + }); + + return result; + } +}; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp new file mode 100644 index 00000000000..c4cacb401af --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief Wrap an object to be available for each GPU + * + * In the MTMG environment we need the ability to manage a collection of objects + * that are associated with a particular GPU, and fetch the objects from an + * arbitrary GPU thread. This object will wrap any object and allow it to be + * accessed from different threads. 
+ */ +template +class device_shared_wrapper_t { + public: + using wrapped_t = T; + + device_shared_wrapper_t() = default; + device_shared_wrapper_t(device_shared_wrapper_t&& other) : objects_{std::move(other.objects_)} {} + device_shared_wrapper_t& operator=(device_shared_wrapper_t&& other) + { + objects_ = std::move(other.objects_); + return *this; + } + + /** + * @brief Move a wrapped object into the wrapper for this thread + * + * @param handle Handle is used to identify the GPU we associated this object with + * @param obj Wrapped object + */ + void set(cugraph::mtmg::handle_t const& handle, wrapped_t&& obj) + { + std::lock_guard lock(lock_); + + auto pos = objects_.find(handle.get_local_rank()); + CUGRAPH_EXPECTS(pos == objects_.end(), "Cannot overwrite wrapped object"); + + objects_.insert(std::make_pair(handle.get_local_rank(), std::move(obj))); + } + + /** + * @brief Move a wrapped object into the wrapper for this thread + * + * @param local_rank Identify which GPU to associated this object with + * @param obj Wrapped object + */ + void set(int local_rank, wrapped_t&& obj) + { + std::lock_guard lock(lock_); + + auto pos = objects_.find(local_rank); + CUGRAPH_EXPECTS(pos == objects_.end(), "Cannot overwrite wrapped object"); + + objects_.insert(std::make_pair(local_rank, std::move(obj))); + } + + public: + /** + * @brief Get reference to an object for a particular thread + * + * @param handle Handle is used to identify the GPU we associated this object with + * @return Reference to the wrapped object + */ + wrapped_t& get(cugraph::mtmg::handle_t const& handle) + { + std::lock_guard lock(lock_); + + auto pos = objects_.find(handle.get_local_rank()); + CUGRAPH_EXPECTS(pos != objects_.end(), "Uninitialized wrapped object"); + + return pos->second; + } + + /** + * @brief Get the pointer to an object for a particular thread from this wrapper + * + * @param handle Handle is used to identify the GPU we associated this object with + * @return Shared pointer the 
wrapped object + */ + wrapped_t const& get(cugraph::mtmg::handle_t const& handle) const + { + std::lock_guard lock(lock_); + + auto pos = objects_.find(handle.get_local_rank()); + + CUGRAPH_EXPECTS(pos != objects_.end(), "Uninitialized wrapped object"); + + return pos->second; + } + + protected: + mutable std::mutex lock_{}; + std::map objects_{}; +}; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp new file mode 100644 index 00000000000..8011146ee4f --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// FIXME: Could use std::span once compiler supports C++20 +#include + +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief An edgelist for each GPU + * + * Manages an edge list for edges associated with a particular GPU. Multiple threads + * can call the append() method, possibly concurrently. To avoid constantly copying + * when the buffers fill up, the class will create a device buffer containing a + * number of elements specified in the constructor. When that device buffer is full + * we will create a new buffer. 
+ * + * When we try and use the edgelist we will consolidate the buffers, since at that + * time we know the entire size required. + * + * Important note, the expectation is that this object will be used in two phases: + * 1) The append() method will be used to fill buffers with edges + * 2) The edges will be consumed to create a graph + * + * These two phases are expected to be disjoint. The calling process is expected to + * manage some barrier so that all threads are guaranteed to be completed before changing + * phases. If an append() call (part of the filling phase) overlaps with calls to + * finalize_buffer(), consolidate_and_shuffle(), get_src(), get_dst(), get_wgt(), + * get_edge_id() and get_edge_type() then the behavior is undefined (data might change + * in some non-deterministic way). + */ +template +class per_device_edgelist_t { + public: + per_device_edgelist_t() = delete; + per_device_edgelist_t(per_device_edgelist_t const&) = delete; + per_device_edgelist_t& operator=(per_device_edgelist_t const&) = delete; + per_device_edgelist_t& operator=(per_device_edgelist_t&&) = delete; + + per_device_edgelist_t(cugraph::mtmg::handle_t const& handle, + size_t device_buffer_size, + bool use_weight, + bool use_edge_id, + bool use_edge_type) + : device_buffer_size_{device_buffer_size}, + current_pos_{0}, + src_{}, + dst_{}, + wgt_{std::nullopt}, + edge_id_{std::nullopt}, + edge_type_{std::nullopt} + { + if (use_weight) { wgt_ = std::make_optional(std::vector>()); } + + if (use_edge_id) { edge_id_ = std::make_optional(std::vector>()); } + + if (use_edge_type) { + edge_type_ = std::make_optional(std::vector>()); + } + + create_new_buffers(handle); + } + + per_device_edgelist_t(per_device_edgelist_t&& other) + : device_buffer_size_{other.device_buffer_size_}, + current_pos_{other.current_pos_}, + src_{std::move(other.src_)}, + dst_{std::move(other.dst_)}, + wgt_{std::move(other.wgt_)}, + edge_id_{std::move(other.edge_id_)}, + edge_type_{std::move(other.edge_type_)} 
+ { + } + + /** + * @brief Append a list of edges to the edge list + * + * @param handle The resource handle + * @param src Source vertex id + * @param dst Destination vertex id + * @param wgt Edge weight + * @param edge_id Edge id + * @param edge_type Edge type + */ + void append(handle_t const& handle, + raft::host_span src, + raft::host_span dst, + std::optional> wgt, + std::optional> edge_id, + std::optional> edge_type) + { + // FIXME: This lock guard could be on a smaller region, but it + // would require more careful coding. The raft::update_device + // calls could be done without the lock if we made a local + // of the values of *.back() and did an increment of current_pos_ + // while we hold the lock. + std::lock_guard lock(lock_); + + size_t count = src.size(); + size_t pos = 0; + + while (count > 0) { + size_t copy_count = std::min(count, (src_.back().size() - current_pos_)); + + raft::update_device( + src_.back().begin() + current_pos_, src.begin() + pos, copy_count, handle.get_stream()); + raft::update_device( + dst_.back().begin() + current_pos_, dst.begin() + pos, copy_count, handle.get_stream()); + if (wgt) + raft::update_device( + wgt_->back().begin() + current_pos_, wgt->begin() + pos, copy_count, handle.get_stream()); + if (edge_id) + raft::update_device(edge_id_->back().begin() + current_pos_, + edge_id->begin() + pos, + copy_count, + handle.get_stream()); + if (edge_type) + raft::update_device(edge_type_->back().begin() + current_pos_, + edge_type->begin() + pos, + copy_count, + handle.get_stream()); + + count -= copy_count; + pos += copy_count; + current_pos_ += copy_count; + + if (current_pos_ == src_.back().size()) { create_new_buffers(handle); } + } + + handle.raft_handle().sync_stream(); + } + + /** + * @brief Mark the edgelist as ready for reading (all writes are complete) + * + * @param handle The resource handle + */ + void finalize_buffer(handle_t const& handle) + { + src_.back().resize(current_pos_, handle.get_stream()); + 
dst_.back().resize(current_pos_, handle.get_stream()); + if (wgt_) wgt_->back().resize(current_pos_, handle.get_stream()); + if (edge_id_) edge_id_->back().resize(current_pos_, handle.get_stream()); + if (edge_type_) edge_type_->back().resize(current_pos_, handle.get_stream()); + } + + bool use_weight() const { return wgt_.has_value(); } + + bool use_edge_id() const { return edge_id_.has_value(); } + + bool use_edge_type() const { return edge_type_.has_value(); } + + std::vector>& get_src() { return src_; } + std::vector>& get_dst() { return dst_; } + std::optional>>& get_wgt() { return wgt_; } + std::optional>>& get_edge_id() { return edge_id_; } + std::optional>>& get_edge_type() + { + return edge_type_; + } + + /** + * @brief Consolidate edgelists (if necessary) and shuffle to the proper GPU + * + * @param handle The resource handle + */ + void consolidate_and_shuffle(cugraph::mtmg::handle_t const& handle, bool store_transposed) + { + if (src_.size() > 1) { + size_t total_size = std::transform_reduce( + src_.begin(), src_.end(), size_t{0}, std::plus(), [](auto& d_vector) { + return d_vector.size(); + }); + + resize_and_copy_buffers(handle.get_stream(), src_, total_size); + resize_and_copy_buffers(handle.get_stream(), dst_, total_size); + if (wgt_) resize_and_copy_buffers(handle.get_stream(), *wgt_, total_size); + if (edge_id_) resize_and_copy_buffers(handle.get_stream(), *edge_id_, total_size); + if (edge_type_) resize_and_copy_buffers(handle.get_stream(), *edge_type_, total_size); + } + + auto tmp_wgt = wgt_ ? std::make_optional(std::move((*wgt_)[0])) : std::nullopt; + auto tmp_edge_id = edge_id_ ? std::make_optional(std::move((*edge_id_)[0])) : std::nullopt; + auto tmp_edge_type = + edge_type_ ? std::make_optional(std::move((*edge_type_)[0])) : std::nullopt; + + std::tie(store_transposed ? dst_[0] : src_[0], + store_transposed ? 
src_[0] : dst_[0], + tmp_wgt, + tmp_edge_id, + tmp_edge_type) = + cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( + handle.raft_handle(), + store_transposed ? std::move(dst_[0]) : std::move(src_[0]), + store_transposed ? std::move(src_[0]) : std::move(dst_[0]), + std::move(tmp_wgt), + std::move(tmp_edge_id), + std::move(tmp_edge_type)); + + if (tmp_wgt) ((*wgt_)[0]) = std::move(*tmp_wgt); + if (tmp_edge_id) ((*edge_id_)[0]) = std::move(*tmp_edge_id); + if (tmp_edge_type) ((*edge_type_)[0]) = std::move(*tmp_edge_type); + } + + private: + template + void resize_and_copy_buffers(rmm::cuda_stream_view stream, + std::vector>& buffer, + size_t total_size) + { + size_t pos = buffer[0].size(); + buffer[0].resize(total_size, stream); + + for (size_t i = 1; i < buffer.size(); ++i) { + raft::copy(buffer[0].data() + pos, buffer[i].data(), buffer[i].size(), stream); + pos += buffer[i].size(); + buffer[i].resize(0, stream); + buffer[i].shrink_to_fit(stream); + } + + std::vector> new_buffer; + new_buffer.push_back(std::move(buffer[0])); + buffer = std::move(new_buffer); + } + + void create_new_buffers(cugraph::mtmg::handle_t const& handle) + { + src_.emplace_back(device_buffer_size_, handle.get_stream()); + dst_.emplace_back(device_buffer_size_, handle.get_stream()); + + if (wgt_) { wgt_->emplace_back(device_buffer_size_, handle.get_stream()); } + + if (edge_id_) { edge_id_->emplace_back(device_buffer_size_, handle.get_stream()); } + + if (edge_type_) { edge_type_->emplace_back(device_buffer_size_, handle.get_stream()); } + + current_pos_ = 0; + } + + mutable std::mutex lock_{}; + + size_t current_pos_{0}; + size_t device_buffer_size_{0}; + + std::vector> src_{}; + std::vector> dst_{}; + std::optional>> wgt_{}; + std::optional>> edge_id_{}; + std::optional>> edge_type_{}; +}; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/edge_property.hpp 
b/cpp/include/cugraph/mtmg/edge_property.hpp new file mode 100644 index 00000000000..afa72492b9a --- /dev/null +++ b/cpp/include/cugraph/mtmg/edge_property.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Edge property object for each GPU + */ +template +class edge_property_t : public detail::device_shared_wrapper_t< + cugraph::edge_property_t> { + public: + using parent_t = detail::device_shared_wrapper_t< + cugraph::edge_property_t>; + + /** + * @brief Return a edge_property_view_t (read only) + */ + auto view() + { + std::lock_guard lock(parent_t::lock_); + + using edge_t = typename graph_view_t::wrapped_t::edge_type; + using buffer_t = + typename cugraph::edge_property_t::buffer_type; + std::vector buffers{}; + using const_value_iterator_t = decltype(get_dataframe_buffer_cbegin(buffers[0])); + + edge_property_view_t result; + + std::for_each(parent_t::objects_.begin(), parent_t::objects_.end(), [&result](auto& p) { + result.set(p.first, p.second.view()); + }); + + return result; + } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/edge_property_view.hpp b/cpp/include/cugraph/mtmg/edge_property_view.hpp new file mode 100644 index 00000000000..c84a6458e1d --- /dev/null +++ b/cpp/include/cugraph/mtmg/edge_property_view.hpp @@ -0,0 +1,33 @@ 
+/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Edge property object for each GPU + */ +template +using edge_property_view_t = + detail::device_shared_wrapper_t>; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/edgelist.hpp b/cpp/include/cugraph/mtmg/edgelist.hpp new file mode 100644 index 00000000000..90c53dfbb64 --- /dev/null +++ b/cpp/include/cugraph/mtmg/edgelist.hpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Edgelist object for each GPU + */ +template +class edgelist_t : public detail::device_shared_wrapper_t< + detail::per_device_edgelist_t> { + public: + /** + * @brief Create a per_device_edgelist for this GPU + */ + void set(handle_t const& handle, + size_t device_buffer_size, + bool use_weight, + bool use_edge_id, + bool use_edge_type) + { + detail::per_device_edgelist_t tmp( + handle, device_buffer_size, use_weight, use_edge_id, use_edge_type); + + detail::device_shared_wrapper_t< + detail::per_device_edgelist_t>::set(handle, + std::move(tmp)); + } + + /** + * @brief Stop inserting edges into this edgelist so we can use the edges + */ + void finalize_buffer(handle_t const& handle) { this->get(handle).finalize_buffer(handle); } + + /** + * @brief Consolidate for the edgelist edges into a single edgelist and then + * shuffle across GPUs. + */ + void consolidate_and_shuffle(cugraph::mtmg::handle_t const& handle, bool store_transposed) + { + this->get(handle).consolidate_and_shuffle(handle, store_transposed); + } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/graph.hpp b/cpp/include/cugraph/mtmg/graph.hpp new file mode 100644 index 00000000000..76a2f401425 --- /dev/null +++ b/cpp/include/cugraph/mtmg/graph.hpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Graph object for each GPU + */ +template +class graph_t : public detail::device_shared_wrapper_t< + cugraph::graph_t> { + using parent_t = detail::device_shared_wrapper_t< + cugraph::graph_t>; + + public: + /** + * @brief Create an MTMG graph view (read only) + */ + auto view() + { + std::lock_guard lock(parent_t::lock_); + + cugraph::mtmg::graph_view_t result; + + std::for_each(parent_t::objects_.begin(), parent_t::objects_.end(), [&result](auto& p) { + result.set(p.first, std::move(p.second.view())); + }); + + return result; + } +}; + +/** + * @brief Create an MTMG graph from an edgelist + * + * @param[in] handle Resource handle + * @param[in] edgelist Edgelist + * @param[in] graph_properties Graph properties + * @param[in] renumber If true, renumber graph (must be true for MG) + * @param[out] graph MTMG graph is stored here + * @param[out] edge_weights MTMG edge weights is stored here + * @param[out] edge_ids MTMG edge ids is stored here + * @param[out] edge_types MTMG edge types is stored here + * @param[in] renumber_map MTMG renumber_map is stored here + * @param[in] do_expensive_check A flag to run expensive checks for input arguments (if set to + * `true`). 
+ */ +template +void create_graph_from_edgelist( + handle_t const& handle, + cugraph::mtmg::edgelist_t& edgelist, + graph_properties_t graph_properties, + bool renumber, + cugraph::mtmg::graph_t& graph, + std::optional, + weight_t>>& edge_weights, + std::optional, + edge_id_t>>& edge_ids, + std::optional, + edge_type_t>>& edge_types, + std::optional>& renumber_map, + bool do_expensive_check = false) +{ + if (handle.get_thread_rank() > 0) return; + + CUGRAPH_EXPECTS(renumber_map.has_value() == renumber, + "Renumbering set to true, but no space for renumber map"); + + auto& my_edgelist = edgelist.get(handle); + + CUGRAPH_EXPECTS(my_edgelist.get_src().size() > 0, "Cannot create graph without an edge list"); + CUGRAPH_EXPECTS(my_edgelist.get_src().size() == 1, + "Must consolidate edges into a single list before creating graph"); + + auto [local_graph, local_edge_weights, local_edge_ids, local_edge_types, local_renumber_map] = + cugraph::create_graph_from_edgelist( + handle.raft_handle(), + std::nullopt, + std::move(my_edgelist.get_src()[0]), + std::move(my_edgelist.get_dst()[0]), + my_edgelist.get_wgt() ? std::make_optional(std::move((*my_edgelist.get_wgt())[0])) + : std::nullopt, + my_edgelist.get_edge_id() ? std::make_optional(std::move((*my_edgelist.get_edge_id())[0])) + : std::nullopt, + my_edgelist.get_edge_type() ? 
std::make_optional(std::move((*my_edgelist.get_edge_type())[0])) + : std::nullopt, + graph_properties, + renumber, + do_expensive_check); + + graph.set(handle, std::move(local_graph)); + if (edge_weights) edge_weights->set(handle, std::move(*local_edge_weights)); + if (edge_ids) edge_ids->set(handle, std::move(*local_edge_ids)); + if (edge_types) edge_types->set(handle, std::move(*local_edge_types)); + if (renumber) renumber_map->set(handle, std::move(*local_renumber_map)); +} + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/graph_view.hpp b/cpp/include/cugraph/mtmg/graph_view.hpp new file mode 100644 index 00000000000..94347e016ea --- /dev/null +++ b/cpp/include/cugraph/mtmg/graph_view.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Graph view for each GPU + */ +template +using graph_view_t = detail::device_shared_wrapper_t< + cugraph::graph_view_t>; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/handle.hpp b/cpp/include/cugraph/mtmg/handle.hpp new file mode 100644 index 00000000000..f23bce5aeac --- /dev/null +++ b/cpp/include/cugraph/mtmg/handle.hpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Resource handler + * + * Multi-threaded resource handler. Every GPU gets a raft::handle object that provides access to + * the GPU resources. In a multi-threaded environment multiple threads will share a particular GPU. + * Following the MPI model, each thread will be assigned to a thread rank. + * + */ +class handle_t { + public: + /** + * @brief Constructor + * + * @param raft_handle Raft handle for the resources + * @param thread_rank Rank for this thread + */ + handle_t(raft::handle_t const& raft_handle, int thread_rank, size_t device_id) + : raft_handle_(raft_handle), + thread_rank_(thread_rank), + local_rank_(raft_handle.get_comms().get_rank()), // FIXME: update for multi-node + device_id_(device_id) + { + } + + /** + * @brief Get the raft handle + * + * @return const reference to a raft handle + */ + raft::handle_t const& raft_handle() const { return raft_handle_; } + + /** + * @brief Get cuda stream + * + * @return cuda stream + */ + rmm::cuda_stream_view get_stream() const + { + return raft_handle_.is_stream_pool_initialized() + ? 
raft_handle_.get_stream_from_stream_pool(device_id_) + : raft_handle_.get_stream(); + } + + /** + * @brief Get thread rank + * + * @return thread rank + */ + int get_thread_rank() const { return thread_rank_; } + + /** + * @brief Get number of gpus + * + * @return number of gpus + */ + int get_size() const { return raft_handle_.get_comms().get_size(); } + + /** + * @brief Get number of local gpus + * + * @return number of local gpus + */ + // FIXME: wrong for multi-node + int get_local_size() const { return raft_handle_.get_comms().get_size(); } + + /** + * @brief Get gpu rank + * + * @return gpu rank + */ + int get_rank() const { return raft_handle_.get_comms().get_rank(); } + + /** + * @brief Get local gpu rank + * + * @return local gpu rank + */ + int get_local_rank() const { return local_rank_; } + + private: + raft::handle_t const& raft_handle_; + int thread_rank_; + int local_rank_; + size_t device_id_; +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/instance_manager.hpp b/cpp/include/cugraph/mtmg/instance_manager.hpp new file mode 100644 index 00000000000..8bf62b56f4b --- /dev/null +++ b/cpp/include/cugraph/mtmg/instance_manager.hpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Manages a subset of the cluster for a set of graph computations + */ +class instance_manager_t { + public: + /** + * @brief Constructor + * + * @param handles Vector of RAFT handles, one for each device on this node + */ + instance_manager_t(std::vector>&& handles, + std::vector>&& nccl_comms, + std::vector&& device_ids, + int local_gpu_count) + : thread_counter_{0}, + raft_handle_{std::move(handles)}, + nccl_comms_{std::move(nccl_comms)}, + device_ids_{std::move(device_ids)}, + local_gpu_count_{local_gpu_count} + { + } + + /** + * @brief Get handle + * + * The instance manager will construct a handle appropriate for the thread making + * the request. Threads will be assigned to GPUs in a round-robin fashion to + * spread requesting threads around the GPU resources. + * + * This function will be CPU thread-safe. + * + * @return a handle for this thread. + */ + handle_t get_handle() + { + int local_id = thread_counter_++; + + RAFT_CUDA_TRY(cudaSetDevice(device_ids_[local_id % raft_handle_.size()].value())); + return handle_t(*raft_handle_[local_id % raft_handle_.size()], + local_id / raft_handle_.size(), + static_cast(local_id % raft_handle_.size())); + } + + /** + * @brief Reset the thread counter + * + * After a parallel activity is completed, we need to reset the thread counter so that + * future threads will round robin around the GPUs properly. + */ + void reset_threads() { thread_counter_.store(0); } + + /** + * @brief Number of local GPUs in the instance + */ + int get_local_gpu_count() { return local_gpu_count_; } + + private: + // FIXME: Should this be an std::map<> where the key is the rank? 
+ // On a multi-node system we might have nodes with fewer + // (or no) GPUs, so mapping rank to a handle might be a challenge + // + std::vector> raft_handle_{}; + std::vector> nccl_comms_{}; + std::vector device_ids_{}; + int local_gpu_count_{}; + + std::atomic thread_counter_{0}; +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp b/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp new file mode 100644 index 00000000000..b672db48719 --- /dev/null +++ b/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Supports creating an edgelist from individual host threads + * + * A cugraph edgelist needs to contain all of the edges necessary to create the graph + * stored in GPU memory (distributed across multiple GPUs in a multi-GPU configuration). + * + * This class provides a mechanism for populating the edgelist object from independent CPU threads. + * + * Calls to the append() method will take edges (in CPU host memory) and append them to a local + * buffer. As the local buffer fills, the buffer will be sent to GPU memory using the flush() + * method. This allows the CPU to GPU transfers to be larger (and consequently more efficient). 
+ */ +template +class per_thread_edgelist_t { + public: + per_thread_edgelist_t() = delete; + per_thread_edgelist_t(per_thread_edgelist_t const&) = delete; + + /** + * @brief Only constructor + * + * @param edgelist The edge list this thread_edgelist_t should be associated with + * @param thread_buffer_size Size of the local buffer for accumulating edges on the CPU + */ + per_thread_edgelist_t( + detail::per_device_edgelist_t& edgelist, + size_t thread_buffer_size) + : edgelist_{edgelist}, + current_pos_{0}, + src_(thread_buffer_size), + dst_(thread_buffer_size), + wgt_{std::nullopt}, + edge_id_{std::nullopt}, + edge_type_{std::nullopt} + { + if (edgelist.use_weight()) wgt_ = std::make_optional(std::vector(thread_buffer_size)); + + if (edgelist.use_edge_id()) + edge_id_ = std::make_optional(std::vector(thread_buffer_size)); + + if (edgelist.use_edge_type()) + edge_type_ = std::make_optional(std::vector(thread_buffer_size)); + } + + /** + * @brief Append an edge to the edge list + * + * @param handle The resource handle + * @param src Source vertex id + * @param dst Destination vertex id + * @param wgt Edge weight + * @param edge_id Edge id + * @param edge_type Edge type + */ + void append(handle_t const& handle, + vertex_t src, + vertex_t dst, + std::optional wgt, + std::optional edge_id, + std::optional edge_type) + { + if (current_pos_ == src_.size()) { flush(handle); } + + src_[current_pos_] = src; + dst_[current_pos_] = dst; + if (wgt) (*wgt_)[current_pos_] = *wgt; + if (edge_id) (*edge_id_)[current_pos_] = *edge_id; + if (edge_type) (*edge_type_)[current_pos_] = *edge_type; + + ++current_pos_; + } + + /** + * @brief Append a list of edges to the edge list + * + * @param handle The resource handle + * @param src Source vertex id + * @param dst Destination vertex id + * @param wgt Edge weight + * @param edge_id Edge id + * @param edge_type Edge type + */ + void append(handle_t const& handle, + raft::host_span src, + raft::host_span dst, + std::optional> wgt, + 
std::optional> edge_id, + std::optional> edge_type) + { + size_t count = src.size(); + size_t pos = 0; + + while (count > 0) { + size_t copy_count = std::min(count, (src_.size() - current_pos_)); + + std::copy(src.begin() + pos, src.begin() + pos + copy_count, src_.begin() + current_pos_); + std::copy(dst.begin() + pos, dst.begin() + pos + copy_count, dst_.begin() + current_pos_); + if (wgt) + std::copy(wgt.begin() + pos, wgt.begin() + pos + copy_count, wgt_->begin() + current_pos_); + if (edge_id) + std::copy(edge_id.begin() + pos, + edge_id.begin() + pos + copy_count, + edge_id_->begin() + current_pos_); + if (edge_type) + std::copy(edge_type.begin() + pos, + edge_type.begin() + pos + copy_count, + edge_type_->begin() + current_pos_); + + if (current_pos_ == src_.size()) { flush(handle); } + + count -= copy_count; + pos += copy_count; + } + } + + /** + * @brief Flush thread data from host to GPU memory + * + * @param handle The resource handle + */ + void flush(handle_t const& handle) + { + edgelist_.append( + handle, + raft::host_span{src_.data(), current_pos_}, + raft::host_span{dst_.data(), current_pos_}, + wgt_ ? std::make_optional(raft::host_span{wgt_->data(), current_pos_}) + : std::nullopt, + edge_id_ ? std::make_optional(raft::host_span{edge_id_->data(), current_pos_}) + : std::nullopt, + edge_type_ + ? std::make_optional(raft::host_span{edge_type_->data(), current_pos_}) + : std::nullopt); + + current_pos_ = 0; + } + + private: + detail::per_device_edgelist_t& edgelist_; + size_t current_pos_{0}; + std::vector src_{}; + std::vector dst_{}; + std::optional> wgt_{}; + std::optional> edge_id_{}; + std::optional> edge_type_{}; +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/renumber_map.hpp b/cpp/include/cugraph/mtmg/renumber_map.hpp new file mode 100644 index 00000000000..da07d61bd96 --- /dev/null +++ b/cpp/include/cugraph/mtmg/renumber_map.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device vector for storing a renumber map + */ +template +class renumber_map_t : public detail::device_shared_device_vector_t { + using parent_t = detail::device_shared_device_vector_t; + + public: + /** + * @brief Return a view (read only) of the renumber map + */ + auto view() { return static_cast>(this->parent_t::view()); } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/renumber_map_view.hpp b/cpp/include/cugraph/mtmg/renumber_map_view.hpp new file mode 100644 index 00000000000..5ff7ff5e100 --- /dev/null +++ b/cpp/include/cugraph/mtmg/renumber_map_view.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device span for storing a renumber map + */ +template +using renumber_map_view_t = detail::device_shared_device_span_t; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/resource_manager.hpp b/cpp/include/cugraph/mtmg/resource_manager.hpp new file mode 100644 index 00000000000..b4633626e7c --- /dev/null +++ b/cpp/include/cugraph/mtmg/resource_manager.hpp @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Class for managing local and remote GPU resources for use in + * multi-threaded multi-GPU interface. + * + * Each process in a multi-GPU configuration should have an instance of this + * class. The resource manager object should be configured by calling + * register_local_gpu (or register_remote_gpu once we support a multi-node + * configuration) to allocate resources that can be used in the mtmg space. + * + * When we want to execute some graph computations, we need to create an instance for execution. + * Based on how big a subset of the desired compute resources is desired, we can allocate some + * number of GPUs to the problem (up to the total set of managed resources). 
+ *
+ * The returned instance can be used to create a graph, execute one or more algorithms, etc.  Once
+ * we are done the caller can delete the instance.
+ *
+ * At the moment, the caller is assumed to be responsible for scheduling use of the resources.
+ *
+ * For our first release, we will only consider a single node multi-GPU configuration, so the remote
+ * GPU methods are currently disabled via ifdef.
+ */
+class resource_manager_t {
+ public:
+  /**
+   * @brief Default constructor
+   */
+  resource_manager_t() {}
+
+  /**
+   * @brief add a local GPU to the resource manager.
+   *
+   * Validates the device id against the number of visible devices, records the
+   * rank -> device mapping, and installs a per-device RMM memory resource.
+   *
+   * @param rank       The rank to assign to the local GPU
+   * @param device_id  The device_id corresponding to this rank
+   */
+  void register_local_gpu(int rank, rmm::cuda_device_id device_id)
+  {
+    // lock_ protects local_rank_map_ and per_device_rmm_resources_ from
+    // concurrent registration calls.
+    std::lock_guard lock(lock_);
+
+    CUGRAPH_EXPECTS(local_rank_map_.find(rank) == local_rank_map_.end(),
+                    "cannot register same rank multiple times");
+
+    int num_gpus_this_node;
+    RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_this_node));
+
+    CUGRAPH_EXPECTS((device_id.value() >= 0) && (device_id.value() < num_gpus_this_node),
+                    "device id out of range");
+
+    local_rank_map_.insert(std::pair(rank, device_id));
+
+    RAFT_CUDA_TRY(cudaSetDevice(device_id.value()));
+
+    // FIXME: There is a bug in the cuda_memory_resource that results in a Hang.
+    //        using the pool resource as a work-around.
+    //
+    //        There is a deprecated environment variable: NCCL_LAUNCH_MODE=GROUP
+    //        which should temporarily work around this problem.
+    //
+    //        Ultimately there should be some RMM parameters passed into this function
+    //        (or the constructor of the object) to configure this behavior
+#if 0
+    auto per_device_it = per_device_rmm_resources_.insert(
+      std::pair{rank, std::make_shared()});
+#else
+    // Pool is capped at min(free, total/6), aligned down to RMM's allocation
+    // alignment.  NOTE(review): template arguments of make_owning_wrapper /
+    // make_shared were stripped in transit.
+    auto const [free, total] = rmm::detail::available_device_memory();
+    auto const min_alloc =
+      rmm::detail::align_down(std::min(free, total / 6), rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
+
+    auto per_device_it = per_device_rmm_resources_.insert(
+      std::pair{rank,
+                rmm::mr::make_owning_wrapper(
+                  std::make_shared(), min_alloc)});
+#endif
+
+    rmm::mr::set_per_device_resource(device_id, per_device_it.first->second.get());
+  }
+
+  /**
+   * @brief Create an instance using a subset of the registered resources
+   *
+   * The selected set of resources will be configured as an instance manager.
+   * If @ranks_to_include is a proper subset of the registered resources,
+   * ranks will be renumbered into the range [0, @p ranks_to_use.size()), making
+   * it a proper configuration.
+   *
+   * @param ranks_to_use         a vector containing the ranks to include in the instance.
+   *   Must be a subset of the entire set of available ranks.
+   * @param instance_manager_id  a ncclUniqueId that is shared by all processes participating
+   *   in this instance.  All processes must use the same ID in this call, it is up
+   *   to the calling code to share this ID properly before the call.
+   *
+   * @return unique pointer to instance manager
+   */
+  std::unique_ptr create_instance_manager(
+    std::vector ranks_to_include, ncclUniqueId instance_manager_id) const
+  {
+    // Every requested rank must have been registered first.
+    std::for_each(
+      ranks_to_include.begin(), ranks_to_include.end(), [local_ranks = local_rank_map_](int rank) {
+        CUGRAPH_EXPECTS(local_ranks.find(rank) != local_ranks.end(),
+                        "requesting inclusion of an invalid rank");
+      });
+
+    std::vector> nccl_comms{};
+    std::vector> handles{};
+    std::vector device_ids{};
+
+    nccl_comms.reserve(ranks_to_include.size());
+    handles.reserve(ranks_to_include.size());
+    device_ids.reserve(ranks_to_include.size());
+
+    // Pick the largest divisor of the instance size that is <= sqrt(size) as
+    // the 2D partition row size.
+    // FIXME: not quite right for multi-node
+    auto gpu_row_comm_size = static_cast(sqrt(static_cast(ranks_to_include.size())));
+    while (ranks_to_include.size() % gpu_row_comm_size != 0) {
+      --gpu_row_comm_size;
+    }
+
+    // FIXME: not quite right for multi-node
+    for (size_t i = 0; i < ranks_to_include.size(); ++i) {
+      int rank = ranks_to_include[i];
+      auto pos = local_rank_map_.find(rank);
+      RAFT_CUDA_TRY(cudaSetDevice(pos->second.value()));
+
+      raft::handle_t tmp_handle;
+
+      nccl_comms.push_back(std::make_unique());
+      handles.push_back(
+        std::make_unique(tmp_handle, per_device_rmm_resources_.find(rank)->second));
+      device_ids.push_back(pos->second);
+    }
+
+    // NCCL communicator initialization must run concurrently for all ranks —
+    // one thread per rank, joined below before the instance is returned.
+    std::vector running_threads;
+
+    for (size_t i = 0; i < ranks_to_include.size(); ++i) {
+      running_threads.emplace_back([instance_manager_id,
+                                    idx = i,
+                                    gpu_row_comm_size,
+                                    comm_size = ranks_to_include.size(),
+                                    &ranks_to_include,
+                                    &local_rank_map = local_rank_map_,
+                                    &nccl_comms,
+                                    &handles]() {
+        int rank = ranks_to_include[idx];
+        auto pos = local_rank_map.find(rank);
+        RAFT_CUDA_TRY(cudaSetDevice(pos->second.value()));
+
+        NCCL_TRY(ncclCommInitRank(nccl_comms[idx].get(), comm_size, instance_manager_id, rank));
+
+        raft::comms::build_comms_nccl_only(handles[idx].get(), *nccl_comms[idx], comm_size, rank);
+
+        cugraph::partition_manager::init_subcomm(*handles[idx], gpu_row_comm_size);
+      });
+    }
+
+    std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); });
+
+    // FIXME: Update for multi-node
+    return std::make_unique(
+      std::move(handles), std::move(nccl_comms), std::move(device_ids), ranks_to_include.size());
+  }
+
+  /**
+   * @brief Get a list of all of the currently registered ranks
+   *
+   * @return A copy of the list of ranks.
+   */
+  std::vector registered_ranks() const
+  {
+    std::lock_guard lock(lock_);
+
+    //
+    //  C++20 mechanism:
+    //  return std::vector{ std::views::keys(local_rank_map_).begin(),
+    //                      std::views::keys(local_rank_map_).end() };
+    //  Would need a bit more complicated to handle remote_rank_map_ also
+    //
+    std::vector registered_ranks(local_rank_map_.size());
+    std::transform(
+      local_rank_map_.begin(), local_rank_map_.end(), registered_ranks.begin(), [](auto pair) {
+        return pair.first;
+      });
+
+    return registered_ranks;
+  }
+
+ private:
+  mutable std::mutex lock_{};
+  // rank -> local device id; only local GPUs are supported for now.
+  std::map local_rank_map_{};
+  // rank -> RMM resource installed for that device in register_local_gpu().
+  std::map> per_device_rmm_resources_{};
+};
+
+}  // namespace mtmg
+}  // namespace cugraph
diff --git a/cpp/include/cugraph/mtmg/vertex_result.hpp b/cpp/include/cugraph/mtmg/vertex_result.hpp
new file mode 100644
index 00000000000..e8999b35aa9
--- /dev/null
+++ b/cpp/include/cugraph/mtmg/vertex_result.hpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device vector for storing vertex results + */ +template +class vertex_result_t : public detail::device_shared_device_vector_t { + using parent_t = detail::device_shared_device_vector_t; + + public: + /** + * @brief Create a vertex result view (read only) + */ + auto view() { return static_cast>(this->parent_t::view()); } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/vertex_result_view.hpp b/cpp/include/cugraph/mtmg/vertex_result_view.hpp new file mode 100644 index 00000000000..7a7070d6f2a --- /dev/null +++ b/cpp/include/cugraph/mtmg/vertex_result_view.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device span for referencing a vertex result + */ +template +class vertex_result_view_t : public detail::device_shared_device_span_t { + using parent_t = detail::device_shared_device_span_t; + + public: + vertex_result_view_t(parent_t&& other) : parent_t{std::move(other)} {} + + /** + * @brief Gather results from specified vertices into a device vector + */ + template + rmm::device_uvector gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index 3a84cdedfda..92c70fcff20 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -388,9 +388,11 @@ void pagerank(raft::handle_t const& handle, handle, graph_view, edge_weight_view, - std::make_optional(raft::device_span{ - *precomputed_vertex_out_weight_sums, - static_cast(graph_view.local_vertex_partition_range_size())}), + precomputed_vertex_out_weight_sums + ? std::make_optional(raft::device_span{ + *precomputed_vertex_out_weight_sums, + static_cast(graph_view.local_vertex_partition_range_size())}) + : std::nullopt, personalization_vertices ? std::make_optional(std::make_tuple( raft::device_span{*personalization_vertices, diff --git a/cpp/src/mtmg/vertex_result.cu b/cpp/src/mtmg/vertex_result.cu new file mode 100644 index 00000000000..a669a127f41 --- /dev/null +++ b/cpp/src/mtmg/vertex_result.cu @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// NOTE(review): #include targets and template parameter lists in this file
+// were stripped in transit; restore from the repository before compiling.
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+namespace cugraph {
+namespace mtmg {
+
+// Gather per-vertex results for an arbitrary set of (externally numbered)
+// vertices.  Pipeline: renumber -> shuffle requests to owning GPUs -> gather
+// locally -> shuffle back -> scatter into the caller's original order.
+template
+template
+rmm::device_uvector vertex_result_view_t::gather(
+  handle_t const& handle,
+  raft::device_span vertices,
+  cugraph::mtmg::graph_view_t const& graph_view,
+  std::optional>& renumber_map_view)
+{
+  auto this_gpu_graph_view = graph_view.get(handle);
+
+  // vertex_gpu_ids/vertex_pos remember where each request came from so the
+  // results can be routed and reordered after the shuffle.
+  rmm::device_uvector local_vertices(vertices.size(), handle.get_stream());
+  rmm::device_uvector vertex_gpu_ids(vertices.size(), handle.get_stream());
+  rmm::device_uvector vertex_pos(vertices.size(), handle.get_stream());
+  rmm::device_uvector result(vertices.size(), handle.get_stream());
+
+  raft::copy(local_vertices.data(), vertices.data(), vertices.size(), handle.get_stream());
+  cugraph::detail::scalar_fill(
+    handle.get_stream(), vertex_gpu_ids.data(), vertex_gpu_ids.size(), handle.get_rank());
+  cugraph::detail::sequence_fill(
+    handle.get_stream(), vertex_pos.data(), vertex_pos.size(), size_t{0});
+
+  rmm::device_uvector d_vertex_partition_range_lasts(
+    this_gpu_graph_view.vertex_partition_range_lasts().size(), handle.get_stream());
+  raft::update_device(d_vertex_partition_range_lasts.data(),
+                      this_gpu_graph_view.vertex_partition_range_lasts().data(),
+                      this_gpu_graph_view.vertex_partition_range_lasts().size(),
+                      handle.get_stream());
+
+  if (renumber_map_view) {
+    // Translate external vertex ids into this graph's internal numbering.
+    cugraph::renumber_ext_vertices(
+      handle.raft_handle(),
+      local_vertices.data(),
+      local_vertices.size(),
+      renumber_map_view->get(handle).data(),
+      this_gpu_graph_view.local_vertex_partition_range_first(),
+      this_gpu_graph_view.local_vertex_partition_range_last());
+  }
+
+  auto const major_comm_size =
+    handle.raft_handle().get_subcomm(cugraph::partition_manager::major_comm_name()).get_size();
+  auto const minor_comm_size =
+    handle.raft_handle().get_subcomm(cugraph::partition_manager::minor_comm_name()).get_size();
+
+  // Route each request to the GPU that owns that vertex partition.
+  std::forward_as_tuple(local_vertices, std::tie(vertex_gpu_ids, vertex_pos), std::ignore) =
+    groupby_gpu_id_and_shuffle_kv_pairs(
+      handle.raft_handle().get_comms(),
+      local_vertices.begin(),
+      local_vertices.end(),
+      thrust::make_zip_iterator(vertex_gpu_ids.begin(), vertex_pos.begin()),
+      cugraph::detail::compute_gpu_id_from_int_vertex_t{
+        raft::device_span(d_vertex_partition_range_lasts.data(),
+                          d_vertex_partition_range_lasts.size()),
+        major_comm_size,
+        minor_comm_size},
+      handle.get_stream());
+
+  //
+  //  Now gather
+  //
+  rmm::device_uvector tmp_result(local_vertices.size(), handle.get_stream());
+
+  auto& wrapped = this->get(handle);
+
+  auto vertex_partition = vertex_partition_device_view_t(
+    this_gpu_graph_view.local_vertex_partition_view());
+
+  // Map each (internal) vertex id to its offset in this GPU's local partition.
+  auto iter =
+    thrust::make_transform_iterator(local_vertices.begin(), [vertex_partition] __device__(auto v) {
+      return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
+    });
+
+  thrust::gather(handle.raft_handle().get_thrust_policy(),
+                 iter,
+                 iter + local_vertices.size(),
+                 wrapped.begin(),
+                 tmp_result.begin());
+
+  //
+  //  Shuffle back
+  //
+  std::forward_as_tuple(std::ignore, std::tie(std::ignore, vertex_pos, tmp_result), std::ignore) =
+    groupby_gpu_id_and_shuffle_kv_pairs(
+      handle.raft_handle().get_comms(),
+      vertex_gpu_ids.begin(),
+      vertex_gpu_ids.end(),
+      thrust::make_zip_iterator(local_vertices.begin(), vertex_pos.begin(), tmp_result.begin()),
+      [] __device__(int gpu) { return gpu; },
+      handle.get_stream());
+
+  //
+  //  Finally, reorder result
+  //
+  thrust::scatter(handle.raft_handle().get_thrust_policy(),
+                  tmp_result.begin(),
+                  tmp_result.end(),
+                  vertex_pos.begin(),
+                  result.begin());
+
+  return result;
+}
+
+// Explicit instantiations for the supported (vertex_t, result_t) combinations.
+template rmm::device_uvector vertex_result_view_t::gather(
+  handle_t const& handle,
+  raft::device_span vertices,
+  cugraph::mtmg::graph_view_t const& graph_view,
+  std::optional>& renumber_map_view);
+
+template rmm::device_uvector vertex_result_view_t::gather(
+  handle_t const& handle,
+  raft::device_span vertices,
+  cugraph::mtmg::graph_view_t const& graph_view,
+  std::optional>& renumber_map_view);
+
+template rmm::device_uvector vertex_result_view_t::gather(
+  handle_t const& handle,
+  raft::device_span vertices,
+  cugraph::mtmg::graph_view_t const& graph_view,
+  std::optional>& renumber_map_view);
+
+template rmm::device_uvector vertex_result_view_t::gather(
+  handle_t const& handle,
+  raft::device_span vertices,
+  cugraph::mtmg::graph_view_t const& graph_view,
+  std::optional>& renumber_map_view);
+
+template rmm::device_uvector vertex_result_view_t::gather(
+  handle_t const& handle,
+  raft::device_span vertices,
+  cugraph::mtmg::graph_view_t const& graph_view,
+  std::optional>& renumber_map_view);
+
+template rmm::device_uvector vertex_result_view_t::gather(
+  handle_t const& handle,
+  raft::device_span vertices,
+  cugraph::mtmg::graph_view_t const& graph_view,
+  std::optional>& renumber_map_view);
+
+}  // namespace mtmg
+}  // namespace cugraph
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 5e1e1d6ace3..f08606df8ea 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -419,6 +419,14 @@ ConfigureTest(K_HOP_NBRS_TEST traversal/k_hop_nbrs_test.cpp)
 # - install tests ---------------------------------------------------------------------------------
 rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libcugraph)
+###################################################################################################
+# - MTMG tests
------------------------------------------------------------------------- +ConfigureTest(MTMG_TEST mtmg/threaded_test.cu) +target_link_libraries(MTMG_TEST + PRIVATE + UCP::UCP + ) + ################################################################################################### # - MG tests -------------------------------------------------------------------------------------- diff --git a/cpp/tests/mtmg/threaded_test.cu b/cpp/tests/mtmg/threaded_test.cu new file mode 100644 index 00000000000..c5dc2d3c7ce --- /dev/null +++ b/cpp/tests/mtmg/threaded_test.cu @@ -0,0 +1,459 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +#include + +#include +#include + +struct Multithreaded_Usecase { + bool test_weighted{false}; + bool check_correctness{true}; +}; + +template +class Tests_Multithreaded + : public ::testing::TestWithParam> { + public: + Tests_Multithreaded() {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + std::vector get_gpu_list() + { + int num_gpus_per_node{1}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + + std::vector gpu_list(num_gpus_per_node); + std::iota(gpu_list.begin(), gpu_list.end(), 0); + + return gpu_list; + } + + template + void run_current_test( + std::tuple const& param, + std::vector gpu_list) + { + using edge_type_t = int32_t; + + constexpr bool renumber = true; + constexpr bool do_expensive_check = false; + + auto [multithreaded_usecase, input_usecase] = param; + + raft::handle_t handle{}; + + result_t constexpr alpha{0.85}; + result_t constexpr epsilon{1e-6}; + + size_t device_buffer_size{64 * 1024 * 1024}; + size_t thread_buffer_size{4 * 1024 * 1024}; + + int num_gpus = gpu_list.size(); + int num_threads = num_gpus * 4; + + cugraph::mtmg::resource_manager_t resource_manager; + + std::for_each(gpu_list.begin(), gpu_list.end(), [&resource_manager](int gpu_id) { + resource_manager.register_local_gpu(gpu_id, rmm::cuda_device_id{gpu_id}); + }); + + ncclUniqueId instance_manager_id; + ncclGetUniqueId(&instance_manager_id); + + auto instance_manager = resource_manager.create_instance_manager( + resource_manager.registered_ranks(), instance_manager_id); + + cugraph::mtmg::edgelist_t edgelist; + cugraph::mtmg::graph_t graph; + cugraph::mtmg::graph_view_t graph_view; + cugraph::mtmg::vertex_result_t pageranks; + std::optional> renumber_map = + std::make_optional>(); + + auto 
edge_weights = multithreaded_usecase.test_weighted + ? std::make_optional, + weight_t>>() + : std::nullopt; + + // + // Simulate graph creation by spawning threads to walk through the + // local COO and add edges + // + std::vector running_threads; + + // Initialize shared edgelist object, one per GPU + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &edgelist, + device_buffer_size, + use_weight = true, + use_edge_id = false, + use_edge_type = false]() { + auto thread_handle = instance_manager->get_handle(); + + edgelist.set(thread_handle, device_buffer_size, use_weight, use_edge_id, use_edge_type); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + // Load SG edge list + auto [d_src_v, d_dst_v, d_weights_v, d_vertices_v, is_symmetric] = + input_usecase.template construct_edgelist( + handle, multithreaded_usecase.test_weighted, false, false); + + auto h_src_v = cugraph::test::to_host(handle, d_src_v); + auto h_dst_v = cugraph::test::to_host(handle, d_dst_v); + auto h_weights_v = cugraph::test::to_host(handle, d_weights_v); + auto unique_vertices = cugraph::test::to_host(handle, d_vertices_v); + + // Load edgelist from different threads. 
We'll use more threads than GPUs here + for (int i = 0; i < num_threads; ++i) { + running_threads.emplace_back([&instance_manager, + thread_buffer_size, + &edgelist, + &h_src_v, + &h_dst_v, + &h_weights_v, + i, + num_threads]() { + auto thread_handle = instance_manager->get_handle(); + cugraph::mtmg::per_thread_edgelist_t + per_thread_edgelist(edgelist.get(thread_handle), thread_buffer_size); + + for (size_t j = i; j < h_src_v.size(); j += num_threads) { +#if 0 + if (h_weights_v) { + thread_edgelist.append( + thread_handle, h_src_v[j], h_dst_v[j], (*h_weights_v)[j], std::nullopt, std::nullopt); + } else { + thread_edgelist.append( + thread_handle, h_src_v[j], h_dst_v[j], std::nullopt, std::nullopt, std::nullopt); + } +#endif + per_thread_edgelist.append( + thread_handle, + h_src_v[j], + h_dst_v[j], + h_weights_v ? std::make_optional((*h_weights_v)[j]) : std::nullopt, + std::nullopt, + std::nullopt); + } + + per_thread_edgelist.flush(thread_handle); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &graph, + &edge_weights, + &edgelist, + &renumber_map, + &pageranks, + is_symmetric = is_symmetric, + renumber, + do_expensive_check]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_thread_rank() > 0) return; + + std::optional, + edge_t>> + edge_ids{std::nullopt}; + std::optional, + int32_t>> + edge_types{std::nullopt}; + + edgelist.finalize_buffer(thread_handle); + edgelist.consolidate_and_shuffle(thread_handle, true); + + cugraph::mtmg:: + create_graph_from_edgelist( + thread_handle, + edgelist, + cugraph::graph_properties_t{is_symmetric, true}, + renumber, + graph, + edge_weights, + edge_ids, + edge_types, + renumber_map, + do_expensive_check); + }); + } + + // Wait for CPU threads to 
complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + graph_view = graph.view(); + + for (int i = 0; i < num_threads; ++i) { + running_threads.emplace_back( + [&instance_manager, &graph_view, &edge_weights, &pageranks, alpha, epsilon]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_thread_rank() > 0) return; + + auto [local_pageranks, metadata] = + cugraph::pagerank( + thread_handle.raft_handle(), + graph_view.get(thread_handle), + edge_weights ? std::make_optional(edge_weights->get(thread_handle).view()) + : std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + alpha, + epsilon, + 500, + true); + + pageranks.set(thread_handle, std::move(local_pageranks)); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + std::vector, std::vector>> computed_pageranks_v; + std::mutex computed_pageranks_lock{}; + + auto pageranks_view = pageranks.view(); + auto renumber_map_view = renumber_map ? std::make_optional(renumber_map->view()) : std::nullopt; + + // Load computed_pageranks from different threads. 
+ for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &graph_view, + &renumber_map_view, + &pageranks_view, + &computed_pageranks_lock, + &computed_pageranks_v, + &h_src_v, + &h_dst_v, + &h_weights_v, + &unique_vertices, + i, + num_threads]() { + auto thread_handle = instance_manager->get_handle(); + + auto number_of_vertices = unique_vertices->size(); + + std::vector my_vertex_list; + my_vertex_list.reserve((number_of_vertices + num_threads - 1) / num_threads); + + for (size_t j = i; j < number_of_vertices; j += num_threads) { + my_vertex_list.push_back((*unique_vertices)[j]); + } + + rmm::device_uvector d_my_vertex_list(my_vertex_list.size(), + thread_handle.raft_handle().get_stream()); + raft::update_device(d_my_vertex_list.data(), + my_vertex_list.data(), + my_vertex_list.size(), + thread_handle.raft_handle().get_stream()); + + auto d_my_pageranks = pageranks_view.gather( + thread_handle, + raft::device_span{d_my_vertex_list.data(), d_my_vertex_list.size()}, + graph_view, + renumber_map_view); + + std::vector my_pageranks(d_my_pageranks.size()); + raft::update_host(my_pageranks.data(), + d_my_pageranks.data(), + d_my_pageranks.size(), + thread_handle.raft_handle().get_stream()); + + { + std::lock_guard lock(computed_pageranks_lock); + computed_pageranks_v.push_back( + std::make_tuple(std::move(my_vertex_list), std::move(my_pageranks))); + } + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + if (multithreaded_usecase.check_correctness) { + // Want to compare the results in computed_pageranks_v with SG results + cugraph::graph_t sg_graph(handle); + std::optional< + cugraph::edge_property_t, weight_t>> + sg_edge_weights{std::nullopt}; + std::optional> sg_renumber_map{std::nullopt}; + + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, sg_renumber_map) = 
cugraph:: + create_graph_from_edgelist( + handle, + std::nullopt, + std::move(d_src_v), + std::move(d_dst_v), + std::move(d_weights_v), + std::nullopt, + std::nullopt, + cugraph::graph_properties_t{is_symmetric, true}, + true); + + auto [sg_pageranks, meta] = cugraph::pagerank( + handle, + sg_graph.view(), + sg_edge_weights ? std::make_optional(sg_edge_weights->view()) : std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + alpha, + epsilon); + + auto h_sg_pageranks = cugraph::test::to_host(handle, sg_pageranks); + auto h_sg_renumber_map = cugraph::test::to_host(handle, sg_renumber_map); + auto compare_functor = cugraph::test::nearly_equal{ + weight_t{1e-3}, + weight_t{(weight_t{1} / static_cast(h_sg_pageranks.size())) * weight_t{1e-3}}}; + + std::for_each( + computed_pageranks_v.begin(), + computed_pageranks_v.end(), + [h_sg_pageranks, compare_functor, h_sg_renumber_map](auto t1) { + std::for_each( + thrust::make_zip_iterator(std::get<0>(t1).begin(), std::get<1>(t1).begin()), + thrust::make_zip_iterator(std::get<0>(t1).end(), std::get<1>(t1).end()), + [h_sg_pageranks, compare_functor, h_sg_renumber_map](auto t2) { + vertex_t v = thrust::get<0>(t2); + weight_t pr = thrust::get<1>(t2); + + auto pos = std::find(h_sg_renumber_map->begin(), h_sg_renumber_map->end(), v); + auto offset = std::distance(h_sg_renumber_map->begin(), pos); + + ASSERT_TRUE(compare_functor(pr, h_sg_pageranks[offset])) + << "vertex " << v << ", SG result = " << h_sg_pageranks[offset] + << ", mtmg result = " << pr << ", renumber map = " << (*h_sg_renumber_map)[offset]; + }); + }); + } + } +}; + +using Tests_Multithreaded_File = Tests_Multithreaded; +using Tests_Multithreaded_Rmat = Tests_Multithreaded; + +// FIXME: add tests for type combinations +TEST_P(Tests_Multithreaded_File, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam()), std::vector{{0, 1}}); +} + +TEST_P(Tests_Multithreaded_Rmat, CheckInt32Int32FloatFloat) +{ + 
run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), std::vector{{0, 1}}); +} + +INSTANTIATE_TEST_SUITE_P(file_test, + Tests_Multithreaded_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(Multithreaded_Usecase{false, true}, + Multithreaded_Usecase{true, true}), + ::testing::Values(cugraph::test::File_Usecase("karate.csv"), + cugraph::test::File_Usecase("dolphins.csv")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Multithreaded_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values(Multithreaded_Usecase{false, true}, Multithreaded_Usecase{true, true}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with + --gtest_filter to select only the file_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one File_Usecase that differ only in filename + (to avoid running same benchmarks more than once) */ + Tests_Multithreaded_File, + ::testing::Combine( + // disable correctness checks + ::testing::Values(Multithreaded_Usecase{false, false}, Multithreaded_Usecase{true, false}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_Multithreaded_Rmat, + ::testing::Combine( + // disable correctness checks for large graphs + ::testing::Values(Multithreaded_Usecase{false, false}, Multithreaded_Usecase{true, false}), + 
::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() From 686c3727782c6d303385d7ecdb0330d890e8184d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 19 Sep 2023 14:20:15 -0500 Subject: [PATCH 08/22] Update to clang 16.0.6. (#3859) This PR updates cugraph to use clang 16.0.6. The previous version 16.0.1 has some minor formatting issues affecting several RAPIDS repos. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Brad Rees (https://github.com/BradReesWork) - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/3859 --- .pre-commit-config.yaml | 2 +- cpp/src/components/legacy/scc_matrix.cuh | 2 +- cpp/src/cores/core_number_impl.cuh | 2 +- ...er_v_random_select_transform_outgoing_e.cuh | 4 ++-- cpp/src/sampling/random_walks.cuh | 18 +++++++++--------- cpp/src/structure/renumber_edgelist_impl.cuh | 2 +- cpp/src/traversal/bfs_impl.cuh | 2 +- cpp/tests/prims/mg_extract_transform_e.cu | 4 ++-- cpp/tests/traversal/mg_sssp_test.cpp | 2 +- cpp/tests/traversal/sssp_test.cpp | 2 +- cpp/tests/utilities/test_utilities.hpp | 2 +- 11 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0f05aedf1a1..865d06b20e4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: additional_dependencies: - flake8==6.0.0 - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v16.0.1 + rev: v16.0.6 hooks: - id: clang-format exclude: | diff --git a/cpp/src/components/legacy/scc_matrix.cuh b/cpp/src/components/legacy/scc_matrix.cuh index 3d56bdc5bf4..d044123bed0 100644 --- a/cpp/src/components/legacy/scc_matrix.cuh +++ b/cpp/src/components/legacy/scc_matrix.cuh @@ -68,7 +68,7 @@ struct SCC_Data { SCC_Data(size_t nrows, const IndexT* p_d_r_o, // row_offsets const IndexT* p_d_c_i) - : // column indices + : // column indices nrows_(nrows), p_d_r_o_(p_d_r_o), 
p_d_c_i_(p_d_c_i), diff --git a/cpp/src/cores/core_number_impl.cuh b/cpp/src/cores/core_number_impl.cuh index b63ae60f052..ea8e2a9c4ee 100644 --- a/cpp/src/cores/core_number_impl.cuh +++ b/cpp/src/cores/core_number_impl.cuh @@ -72,7 +72,7 @@ struct v_to_core_number_t { // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used template struct mult_degree_by_two_t { - __device__ edge_t operator()(edge_t d) const { return d* edge_t{2}; } + __device__ edge_t operator()(edge_t d) const { return d * edge_t{2}; } }; } // namespace diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 3375a651982..e6db21f1c7c 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -287,7 +287,7 @@ rmm::device_uvector get_sampling_index_without_replacement( #ifndef NO_CUGRAPH_OPS edge_t mid_partition_degree_range_last = static_cast(K * 10); // tuning parameter assert(mid_partition_degree_range_last > K); - size_t high_partition_over_sampling_K = K * 2; // tuning parameter + size_t high_partition_over_sampling_K = K * 2; // tuning parameter assert(high_partition_over_sampling_K > K); rmm::device_uvector sample_nbr_indices(frontier_degrees.size() * K, handle.get_stream()); @@ -883,7 +883,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_nbr_indices); // neighbor index within an edge partition (note that each vertex's // neighbors are distributed in minor_comm_size partitions) std::optional> sample_key_indices{ - std::nullopt}; // relevant only when (minor_comm_size > 1) + std::nullopt}; // relevant only when (minor_comm_size > 1) auto local_frontier_sample_counts = std::vector{}; auto local_frontier_sample_displacements = std::vector{}; if (minor_comm_size > 1) { diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh index 46789c6b8bd..6a7334e9f1a 
100644 --- a/cpp/src/sampling/random_walks.cuh +++ b/cpp/src/sampling/random_walks.cuh @@ -197,19 +197,19 @@ struct col_indx_extract_t { void operator()( original::device_vec_t const& d_coalesced_src_v, // in: coalesced vector of vertices original::device_vec_t const& - d_v_col_indx, // in: column indices, given by stepper's random engine + d_v_col_indx, // in: column indices, given by stepper's random engine original::device_vec_t& d_v_next_vertices, // out: set of destination vertices, for next step original::device_vec_t& - d_v_next_weights) // out: set of weights between src and destination vertices, for next step + d_v_next_weights) // out: set of weights between src and destination vertices, for next step const { thrust::transform_if( handle_.get_thrust_policy(), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_paths_), // input1 - d_v_col_indx.begin(), // input2 - out_degs_, // stencil + thrust::make_counting_iterator(num_paths_), // input1 + d_v_col_indx.begin(), // input2 + out_degs_, // stencil thrust::make_zip_iterator( thrust::make_tuple(d_v_next_vertices.begin(), d_v_next_weights.begin())), // output [max_depth = max_depth_, @@ -575,9 +575,9 @@ struct random_walker_t { d_crt_out_degs, // |current set of vertex out degrees| = nelems, // to be used as stencil (don't scatter if 0) original::device_vec_t const& - d_sizes, // paths sizes used to provide delta in coalesced paths; - // pre-condition: assumed as updated to reflect new vertex additions; - // also, this is the number of _vertices_ in each path; + d_sizes, // paths sizes used to provide delta in coalesced paths; + // pre-condition: assumed as updated to reflect new vertex additions; + // also, this is the number of _vertices_ in each path; // hence for scattering weights this needs to be adjusted; hence the `adjust` parameter index_t stride, // stride = coalesce block size (max_depth for vertices; max_depth-1 for weights) @@ -762,7 +762,7 @@ random_walks_impl( // 
pre-allocate num_paths * max_depth; // original::device_vec_t d_coalesced_v(num_paths * max_depth, - stream); // coalesced vertex set + stream); // coalesced vertex set original::device_vec_t d_coalesced_w(num_paths * (max_depth - 1), stream); // coalesced weight set original::device_vec_t d_paths_sz(num_paths, stream); // paths sizes diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index d7381ba71af..6bc19ff4fe1 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -86,7 +86,7 @@ struct find_unused_id_t { for (size_t i = worker_id; i < sorted_local_vertices.size() + size_t{1}; i += num_workers) { auto start = (i == size_t{0}) ? std::numeric_limits::lowest() : sorted_local_vertices[i - size_t{1}]; - if (start != std::numeric_limits::max()) { ++start; }; // now inclusive + if (start != std::numeric_limits::max()) { ++start; }; // now inclusive auto end = (i == sorted_local_vertices.size()) ? std::numeric_limits::max() : sorted_local_vertices[i]; // exclusive for (vertex_t v = start; v < end; ++v) { diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 0402184bd93..437071569bf 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -73,7 +73,7 @@ struct e_op_t { if (*(prev_visited_flags + packed_bool_offset(dst)) & packed_bool_mask(dst)) { // check if unvisited in previous iterations push = false; - } else { // check if unvisited in this iteration as well + } else { // check if unvisited in this iteration as well auto old = visited_flags.atomic_or(dst, true); push = !old; } diff --git a/cpp/tests/prims/mg_extract_transform_e.cu b/cpp/tests/prims/mg_extract_transform_e.cu index b71fe5ddb5e..bca6471a5bb 100644 --- a/cpp/tests/prims/mg_extract_transform_e.cu +++ b/cpp/tests/prims/mg_extract_transform_e.cu @@ -157,8 +157,8 @@ class Tests_MGExtractTransformE // 1. 
create MG graph constexpr bool is_multi_gpu = true; - constexpr bool renumber = true; // needs to be true for multi gpu case - constexpr bool store_transposed = false; // needs to be false for using extract_transform_e + constexpr bool renumber = true; // needs to be true for multi gpu case + constexpr bool store_transposed = false; // needs to be false for using extract_transform_e if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement handle_->get_comms().barrier(); diff --git a/cpp/tests/traversal/mg_sssp_test.cpp b/cpp/tests/traversal/mg_sssp_test.cpp index b3e96981f96..ea0353c3743 100644 --- a/cpp/tests/traversal/mg_sssp_test.cpp +++ b/cpp/tests/traversal/mg_sssp_test.cpp @@ -214,7 +214,7 @@ class Tests_MGSSSP : public ::testing::TestWithParam> sg_renumber_map, // std::nullopt if the SG graph is not renumbered std::optional> - mg_vertices, // std::nullopt if the entire local vertex partition range is assumed + mg_vertices, // std::nullopt if the entire local vertex partition range is assumed raft::device_span mg_values); template From d93032105ff92a70e28511471444dfcb2557da90 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Wed, 20 Sep 2023 14:19:56 -0700 Subject: [PATCH 09/22] MFG C++ code bug fix (#3865) cugraph::sort_sampled_edgelist currently returns (label, hop) offsets of all zero. This PR fixes this. 
Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/3865 --- cpp/src/sampling/sampling_post_processing_impl.cuh | 10 ++++++++-- cpp/tests/sampling/sampling_post_processing_test.cu | 11 +++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index ff8da72ff35..0c397d91b20 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -1619,10 +1619,13 @@ renumber_and_sort_sampled_edgelist( (*edgelist_label_hop_offsets).begin(), (*edgelist_label_hop_offsets).end(), size_t{0}); - thrust::for_each( + // FIXME: the device lambda should be placed in cuda::proclaim_return_type() + // once we update CCCL version to 2.x + thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(num_labels * num_hops), + (*edgelist_label_hop_offsets).begin(), [edgelist_label_offsets = edgelist_label_offsets ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) : thrust::nullopt, @@ -1743,10 +1746,13 @@ sort_sampled_edgelist( (*edgelist_label_hop_offsets).begin(), (*edgelist_label_hop_offsets).end(), size_t{0}); - thrust::for_each( + // FIXME: the device lambda should be placed in cuda::proclaim_return_type() + // once we update CCCL version to 2.x + thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(num_labels * num_hops), + (*edgelist_label_hop_offsets).begin(), [edgelist_label_offsets = edgelist_label_offsets ? 
thrust::make_optional(std::get<0>(*edgelist_label_offsets)) : thrust::nullopt, diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu index 422fe953b20..e5267d75ac2 100644 --- a/cpp/tests/sampling/sampling_post_processing_test.cu +++ b/cpp/tests/sampling/sampling_post_processing_test.cu @@ -635,6 +635,12 @@ class Tests_SamplingPostProcessing (*renumbered_and_sorted_edgelist_label_hop_offsets).end())) << "Renumbered and sorted edge list (label,hop) offset array values should be " "non-decreasing."; + + ASSERT_TRUE( + (*renumbered_and_sorted_edgelist_label_hop_offsets).back_element(handle.get_stream()) == + renumbered_and_sorted_edgelist_srcs.size()) + << "Renumbered and sorted edge list (label,hop) offset array's last element should " + "coincide with the number of edges."; } if (renumbered_and_sorted_renumber_map_label_offsets) { @@ -1189,6 +1195,11 @@ class Tests_SamplingPostProcessing (*sorted_edgelist_label_hop_offsets).end())) << "Sorted edge list (label,hop) offset array values should be " "non-decreasing."; + + ASSERT_TRUE((*sorted_edgelist_label_hop_offsets).back_element(handle.get_stream()) == + sorted_edgelist_srcs.size()) + << "Sorted edge list (label,hop) offset array's last element should coincide with the " + "number of edges."; } for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { From a53ab34b804af2865d2d210b801a759d2ca29bc6 Mon Sep 17 00:00:00 2001 From: Naim <110031745+naimnv@users.noreply.github.com> Date: Thu, 21 Sep 2023 19:39:18 +0200 Subject: [PATCH 10/22] Refactor python code for similarity algos to use latest CAPI (#3828) This PR - refactors python code for similarity algorithms (Jaccard, Sorensen, Overlap) to use latest CAPI - removes legacy cuda c/c++ code and python wrapper around legacy code - update CAPI tests - remove and update python tests Closes #2546 Closes #2547 Closes #2548 Closes #2549 Closes #2749 Authors: - Naim (https://github.com/naimnv) 
Approvers: - Seunghwa Kang (https://github.com/seunghwak) - Chuck Hastings (https://github.com/ChuckHastings) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3828 --- cpp/CMakeLists.txt | 2 - cpp/src/link_prediction/legacy/jaccard.cu | 429 ------------------ cpp/src/link_prediction/legacy/overlap.cu | 425 ----------------- cpp/tests/c_api/mg_similarity_test.c | 51 ++- cpp/tests/c_api/similarity_test.c | 57 +-- python/cugraph/CMakeLists.txt | 1 - .../cugraph/community/induced_subgraph.py | 9 +- .../cugraph/dask/link_prediction/jaccard.py | 2 +- .../cugraph/dask/link_prediction/overlap.py | 2 +- .../cugraph/dask/link_prediction/sorensen.py | 2 +- .../cugraph/cugraph/experimental/__init__.py | 32 +- .../experimental/link_prediction/__init__.py | 13 - .../experimental/link_prediction/jaccard.py | 255 ----------- .../experimental/link_prediction/overlap.py | 223 --------- .../experimental/link_prediction/sorensen.py | 221 --------- .../cugraph/link_prediction/CMakeLists.txt | 22 - .../cugraph/link_prediction/__init__.py | 23 +- .../cugraph/link_prediction/jaccard.pxd | 35 -- .../cugraph/link_prediction/jaccard.py | 208 ++++++--- .../link_prediction/jaccard_wrapper.pyx | 155 ------- .../cugraph/link_prediction/overlap.pxd | 35 -- .../cugraph/link_prediction/overlap.py | 212 +++++++-- .../link_prediction/overlap_wrapper.pyx | 142 ------ .../cugraph/link_prediction/sorensen.py | 223 ++++++--- .../cugraph/link_prediction/wjaccard.py | 100 ++-- .../cugraph/link_prediction/woverlap.py | 76 ++-- .../cugraph/link_prediction/wsorensen.py | 78 ++-- .../cugraph/cugraph/sampling/random_walks.py | 9 +- .../tests/link_prediction/test_jaccard.py | 315 +++++++------ .../tests/link_prediction/test_overlap.py | 152 ++++--- .../tests/link_prediction/test_sorensen.py | 252 ++++++---- .../tests/link_prediction/test_wjaccard.py | 177 -------- .../tests/link_prediction/test_woverlap.py | 171 ------- .../tests/link_prediction/test_wsorensen.py | 181 
-------- python/pylibcugraph/pylibcugraph/__init__.py | 7 + .../pylibcugraph/experimental/__init__.py | 19 +- .../pylibcugraph/jaccard_coefficients.pyx | 12 +- .../pylibcugraph/overlap_coefficients.pyx | 10 +- .../pylibcugraph/sorensen_coefficients.pyx | 10 +- 39 files changed, 1129 insertions(+), 3219 deletions(-) delete mode 100644 cpp/src/link_prediction/legacy/jaccard.cu delete mode 100644 cpp/src/link_prediction/legacy/overlap.cu delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/__init__.py delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/jaccard.py delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/overlap.py delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/sorensen.py delete mode 100644 python/cugraph/cugraph/link_prediction/CMakeLists.txt delete mode 100644 python/cugraph/cugraph/link_prediction/jaccard.pxd delete mode 100644 python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx delete mode 100644 python/cugraph/cugraph/link_prediction/overlap.pxd delete mode 100644 python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx delete mode 100644 python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py delete mode 100644 python/cugraph/cugraph/tests/link_prediction/test_woverlap.py delete mode 100644 python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a6c26ee3b91..0d7bd86075d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -196,8 +196,6 @@ set(CUGRAPH_SOURCES src/utilities/path_retrieval.cu src/structure/legacy/graph.cu src/linear_assignment/legacy/hungarian.cu - src/link_prediction/legacy/jaccard.cu - src/link_prediction/legacy/overlap.cu src/link_prediction/jaccard_sg.cu src/link_prediction/sorensen_sg.cu src/link_prediction/overlap_sg.cu diff --git a/cpp/src/link_prediction/legacy/jaccard.cu b/cpp/src/link_prediction/legacy/jaccard.cu deleted file mode 100644 index 
d0b240e3c77..00000000000 --- a/cpp/src/link_prediction/legacy/jaccard.cu +++ /dev/null @@ -1,429 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -#include - -namespace cugraph { -namespace detail { - -// Volume of neighboors (*weight_s) -template -__global__ void jaccard_row_sum( - vertex_t n, edge_t const* csrPtr, vertex_t const* csrInd, weight_t const* v, weight_t* work) -{ - vertex_t row; - edge_t start, end, length; - weight_t sum; - - for (row = threadIdx.y + blockIdx.y * blockDim.y; row < n; row += gridDim.y * blockDim.y) { - start = csrPtr[row]; - end = csrPtr[row + 1]; - length = end - start; - - // compute row sums - if (weighted) { - sum = parallel_prefix_sum(length, csrInd + start, v); - if (threadIdx.x == 0) work[row] = sum; - } else { - work[row] = static_cast(length); - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -template -__global__ void jaccard_is(vertex_t n, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, j, Ni, Nj; - vertex_t row, col; - vertex_t ref, cur, ref_col, cur_col, match; - weight_t ref_val; - - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < 
csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? col : row; - - // compute new sum weights - weight_s[j] = work[row] + work[col]; - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[j], ref_val); } - } - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -// Using list of node pairs -template -__global__ void jaccard_is_pairs(edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, idx, Ni, Nj, match; - vertex_t row, col, ref, cur, ref_col, cur_col; - weight_t ref_val; - - for (idx = threadIdx.z + blockIdx.z * blockDim.z; idx < num_pairs; - idx += gridDim.z * blockDim.z) { - row = first_pair[idx]; - col = second_pair[idx]; - - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - 
csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? col : row; - - // compute new sum weights - weight_s[idx] = work[row] + work[col]; - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[idx], ref_val); } - } - } -} - -// Jaccard weights (*weight) -template -__global__ void jaccard_jw(edge_t e, - weight_t const* weight_i, - weight_t const* weight_s, - weight_t* weight_j) -{ - edge_t j; - weight_t Wi, Ws, Wu; - - for (j = threadIdx.x + blockIdx.x * blockDim.x; j < e; j += gridDim.x * blockDim.x) { - Wi = weight_i[j]; - Ws = weight_s[j]; - Wu = Ws - Wi; - weight_j[j] = (Wi / Wu); - } -} - -template -int jaccard(vertex_t n, - edge_t e, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - rmm::cuda_stream_view stream_view; - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - - // launch kernel - jaccard_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - - 
thrust::fill(rmm::exec_policy(stream_view), weight_i, weight_i + e, weight_t{0.0}); - - // setup launch configuration - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - jaccard_is<<>>( - n, csrPtr, csrInd, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(e, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, edge_t{CUDA_MAX_BLOCKS}); - nblocks.y = 1; - nblocks.z = 1; - - // launch kernel - jaccard_jw - <<>>(e, weight_i, weight_s, weight_j); - - return 0; -} - -template -int jaccard_pairs(vertex_t n, - edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - - // launch kernel - jaccard_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - - // NOTE: initilized weight_i vector with 0.0 - // fill(num_pairs, weight_i, weight_t{0.0}); - - // setup launch configuration - nthreads.x = 32; - nthreads.y = 1; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - jaccard_is_pairs<<>>( - num_pairs, csrPtr, csrInd, first_pair, second_pair, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(num_pairs, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((num_pairs + nthreads.x - 1) / 
nthreads.x, (edge_t)CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - - // launch kernel - jaccard_jw - <<>>(num_pairs, weight_i, weight_s, weight_j); - - return 0; -} -} // namespace detail - -template -void jaccard(legacy::GraphCSRView const& graph, WT const* weights, WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - - rmm::device_vector weight_i(graph.number_of_edges); - rmm::device_vector weight_s(graph.number_of_edges); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::jaccard(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::jaccard(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template -void jaccard_list(legacy::GraphCSRView const& graph, - WT const* weights, - ET num_pairs, - VT const* first, - VT const* second, - WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - CUGRAPH_EXPECTS(first != nullptr, "Invalid input argument: first is NULL"); - CUGRAPH_EXPECTS(second != nullptr, "Invalid input argument: second in NULL"); - - rmm::device_vector weight_i(num_pairs, WT{0.0}); - rmm::device_vector weight_s(num_pairs); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::jaccard_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::jaccard_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - 
weight_s.data().get(), - result); - } -} - -template void jaccard(legacy::GraphCSRView const&, - float const*, - float*); -template void jaccard( - legacy::GraphCSRView const&, double const*, double*); -template void jaccard(legacy::GraphCSRView const&, - float const*, - float*); -template void jaccard( - legacy::GraphCSRView const&, double const*, double*); -template void jaccard_list( - legacy::GraphCSRView const&, - float const*, - int32_t, - int32_t const*, - int32_t const*, - float*); -template void jaccard_list( - legacy::GraphCSRView const&, - double const*, - int32_t, - int32_t const*, - int32_t const*, - double*); -template void jaccard_list( - legacy::GraphCSRView const&, - float const*, - int64_t, - int64_t const*, - int64_t const*, - float*); -template void jaccard_list( - legacy::GraphCSRView const&, - double const*, - int64_t, - int64_t const*, - int64_t const*, - double*); - -} // namespace cugraph diff --git a/cpp/src/link_prediction/legacy/overlap.cu b/cpp/src/link_prediction/legacy/overlap.cu deleted file mode 100644 index 67d7cd5e4c6..00000000000 --- a/cpp/src/link_prediction/legacy/overlap.cu +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace cugraph { -namespace detail { - -// Volume of neighboors (*weight_s) -// TODO: Identical kernel to jaccard_row_sum!! 
-template -__global__ void overlap_row_sum( - vertex_t n, edge_t const* csrPtr, vertex_t const* csrInd, weight_t const* v, weight_t* work) -{ - vertex_t row; - edge_t start, end, length; - weight_t sum; - - for (row = threadIdx.y + blockIdx.y * blockDim.y; row < n; row += gridDim.y * blockDim.y) { - start = csrPtr[row]; - end = csrPtr[row + 1]; - length = end - start; - - // compute row sums - if (weighted) { - sum = parallel_prefix_sum(length, csrInd + start, v); - if (threadIdx.x == 0) work[row] = sum; - } else { - work[row] = static_cast(length); - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -// TODO: Identical kernel to jaccard_row_sum!! -template -__global__ void overlap_is(vertex_t n, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, j, Ni, Nj; - vertex_t row, col; - vertex_t ref, cur, ref_col, cur_col, match; - weight_t ref_val; - - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - // compute new sum weights - weight_s[j] = min(work[row], work[col]); - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[j], ref_val); } - } - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -// Using list of node pairs -// NOTE: NOT the same as jaccard -template -__global__ void overlap_is_pairs(edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, idx, Ni, Nj, match; - vertex_t row, col, ref, cur, ref_col, cur_col; - weight_t ref_val; - - for (idx = threadIdx.z + blockIdx.z * blockDim.z; idx < num_pairs; - idx += gridDim.z * blockDim.z) { - row = first_pair[idx]; - col = second_pair[idx]; - - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - // compute new sum weights - weight_s[idx] = min(work[row], work[col]); - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[idx], ref_val); } - } - } -} - -// Overlap weights (*weight) -template -__global__ void overlap_jw(edge_t e, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - edge_t j; - weight_t Wi, Wu; - - for (j = threadIdx.x + blockIdx.x * blockDim.x; j < e; j += gridDim.x * blockDim.x) { - Wi = weight_i[j]; - Wu = weight_s[j]; - weight_j[j] = (Wi / Wu); - } -} - -template -int overlap(vertex_t n, - edge_t e, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - - // launch kernel - overlap_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - fill(e, weight_i, weight_t{0.0}); - - // setup launch configuration - 
nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - overlap_is - <<>>(n, csrPtr, csrInd, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(e, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, edge_t{CUDA_MAX_BLOCKS}); - nblocks.y = 1; - nblocks.z = 1; - - // launch kernel - overlap_jw - <<>>(e, csrPtr, csrInd, weight_i, weight_s, weight_j); - - return 0; -} - -template -int overlap_pairs(vertex_t n, - edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - // launch kernel - - overlap_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - fill(num_pairs, weight_i, weight_t{0.0}); - // setup launch configuration - nthreads.x = 32; - nthreads.y = 1; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - overlap_is_pairs<<>>( - num_pairs, csrPtr, csrInd, first_pair, second_pair, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(num_pairs, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, edge_t{CUDA_MAX_BLOCKS}); - nblocks.y = 1; - nblocks.z = 1; - // launch kernel - - overlap_jw - <<>>(num_pairs, csrPtr, csrInd, weight_i, weight_s, 
weight_j); - - return 0; -} -} // namespace detail - -template -void overlap(legacy::GraphCSRView const& graph, WT const* weights, WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - - rmm::device_vector weight_i(graph.number_of_edges); - rmm::device_vector weight_s(graph.number_of_edges); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::overlap(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::overlap(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template -void overlap_list(legacy::GraphCSRView const& graph, - WT const* weights, - ET num_pairs, - VT const* first, - VT const* second, - WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - CUGRAPH_EXPECTS(first != nullptr, "Invalid input argument: first column is NULL"); - CUGRAPH_EXPECTS(second != nullptr, "Invalid input argument: second column is NULL"); - - rmm::device_vector weight_i(num_pairs); - rmm::device_vector weight_s(num_pairs); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::overlap_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::overlap_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template void overlap(legacy::GraphCSRView const&, - float const*, - float*); -template void 
overlap( - legacy::GraphCSRView const&, double const*, double*); -template void overlap(legacy::GraphCSRView const&, - float const*, - float*); -template void overlap( - legacy::GraphCSRView const&, double const*, double*); -template void overlap_list( - legacy::GraphCSRView const&, - float const*, - int32_t, - int32_t const*, - int32_t const*, - float*); -template void overlap_list( - legacy::GraphCSRView const&, - double const*, - int32_t, - int32_t const*, - int32_t const*, - double*); -template void overlap_list( - legacy::GraphCSRView const&, - float const*, - int64_t, - int64_t const*, - int64_t const*, - float*); -template void overlap_list( - legacy::GraphCSRView const&, - double const*, - int64_t, - int64_t const*, - int64_t const*, - double*); - -} // namespace cugraph diff --git a/cpp/tests/c_api/mg_similarity_test.c b/cpp/tests/c_api/mg_similarity_test.c index 0ac160245ab..336f6c50519 100644 --- a/cpp/tests/c_api/mg_similarity_test.c +++ b/cpp/tests/c_api/mg_similarity_test.c @@ -160,15 +160,16 @@ int test_jaccard(const cugraph_resource_handle_t* handle) int test_weighted_jaccard(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + 
weight_t h_result[] = {0.357143, 0.208333, 0.0}; return generic_similarity_test(handle, h_src, @@ -216,15 +217,16 @@ int test_sorensen(const cugraph_resource_handle_t* handle) int test_weighted_sorensen(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.526316, 0.344828, 0.000000}; return generic_similarity_test(handle, h_src, @@ -272,15 +274,16 @@ int test_overlap(const cugraph_resource_handle_t* handle) int test_weighted_overlap(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 
2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.714286, 0.416667, 0.000000}; return generic_similarity_test(handle, h_src, diff --git a/cpp/tests/c_api/similarity_test.c b/cpp/tests/c_api/similarity_test.c index 20af3f3eccd..52f849ccd28 100644 --- a/cpp/tests/c_api/similarity_test.c +++ b/cpp/tests/c_api/similarity_test.c @@ -161,15 +161,16 @@ int test_jaccard() int test_weighted_jaccard() { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.357143, 0.208333, 0.0}; return generic_similarity_test(h_src, h_dst, @@ -215,15 +216,16 @@ int test_sorensen() int test_weighted_sorensen() { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 
4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.526316, 0.344828, 0.000000}; return generic_similarity_test(h_src, h_dst, @@ -269,15 +271,16 @@ int test_overlap() int test_weighted_overlap() { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.714286, 0.416667, 0.000000}; return generic_similarity_test(h_src, h_dst, @@ -301,8 +304,8 @@ int main(int argc, char** argv) result |= RUN_TEST(test_jaccard); result |= RUN_TEST(test_sorensen); result |= RUN_TEST(test_overlap); - // result |= RUN_TEST(test_weighted_jaccard); - // result |= RUN_TEST(test_weighted_sorensen); - // result |= RUN_TEST(test_weighted_overlap); + result |= RUN_TEST(test_weighted_jaccard); + result |= RUN_TEST(test_weighted_sorensen); + result |= RUN_TEST(test_weighted_overlap); return result; } diff --git 
a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index f3b28623b12..ecfcb9b219f 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -89,7 +89,6 @@ add_subdirectory(cugraph/dask/structure) add_subdirectory(cugraph/internals) add_subdirectory(cugraph/layout) add_subdirectory(cugraph/linear_assignment) -add_subdirectory(cugraph/link_prediction) add_subdirectory(cugraph/structure) add_subdirectory(cugraph/tree) add_subdirectory(cugraph/utilities) diff --git a/python/cugraph/cugraph/community/induced_subgraph.py b/python/cugraph/cugraph/community/induced_subgraph.py index 29fe2f29c1e..3a901199b01 100644 --- a/python/cugraph/cugraph/community/induced_subgraph.py +++ b/python/cugraph/cugraph/community/induced_subgraph.py @@ -25,11 +25,10 @@ ) from cugraph.utilities.utils import import_optional -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. 
networkx = import_optional("networkx") diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index b3d688584a0..218e6206fc3 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -20,7 +20,7 @@ from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair -from pylibcugraph.experimental import ( +from pylibcugraph import ( jaccard_coefficients as pylibcugraph_jaccard_coefficients, ) from pylibcugraph import ResourceHandle diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index c47aeef3c72..5540be28fd1 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -20,7 +20,7 @@ from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair -from pylibcugraph.experimental import ( +from pylibcugraph import ( overlap_coefficients as pylibcugraph_overlap_coefficients, ) from pylibcugraph import ResourceHandle diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index bb5a3f44f39..24295ac330c 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -20,7 +20,7 @@ from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair -from pylibcugraph.experimental import ( +from pylibcugraph import ( sorensen_coefficients as pylibcugraph_sorensen_coefficients, ) from pylibcugraph import ResourceHandle diff --git a/python/cugraph/cugraph/experimental/__init__.py b/python/cugraph/cugraph/experimental/__init__.py index b96b760e634..2309a529047 100644 --- 
a/python/cugraph/cugraph/experimental/__init__.py +++ b/python/cugraph/cugraph/experimental/__init__.py @@ -48,30 +48,22 @@ experimental_warning_wrapper(EXPERIMENTAL__find_bicliques) ) -from cugraph.experimental.link_prediction.jaccard import ( - EXPERIMENTAL__jaccard, - EXPERIMENTAL__jaccard_coefficient, -) +from cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler -jaccard = experimental_warning_wrapper(EXPERIMENTAL__jaccard) -jaccard_coefficient = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficient) +BulkSampler = experimental_warning_wrapper(EXPERIMENTAL__BulkSampler) -from cugraph.experimental.link_prediction.sorensen import ( - EXPERIMENTAL__sorensen, - EXPERIMENTAL__sorensen_coefficient, -) -sorensen = experimental_warning_wrapper(EXPERIMENTAL__sorensen) -sorensen_coefficient = experimental_warning_wrapper(EXPERIMENTAL__sorensen_coefficient) +from cugraph.link_prediction.jaccard import jaccard, jaccard_coefficient -from cugraph.experimental.link_prediction.overlap import ( - EXPERIMENTAL__overlap, - EXPERIMENTAL__overlap_coefficient, -) +jaccard = promoted_experimental_warning_wrapper(jaccard) +jaccard_coefficient = promoted_experimental_warning_wrapper(jaccard_coefficient) -overlap = experimental_warning_wrapper(EXPERIMENTAL__overlap) -overlap_coefficient = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficient) +from cugraph.link_prediction.sorensen import sorensen, sorensen_coefficient -from cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler +sorensen = promoted_experimental_warning_wrapper(sorensen) +sorensen_coefficient = promoted_experimental_warning_wrapper(sorensen_coefficient) -BulkSampler = experimental_warning_wrapper(EXPERIMENTAL__BulkSampler) +from cugraph.link_prediction.overlap import overlap, overlap_coefficient + +overlap = promoted_experimental_warning_wrapper(overlap) +overlap_coefficient = promoted_experimental_warning_wrapper(overlap_coefficient) diff --git 
a/python/cugraph/cugraph/experimental/link_prediction/__init__.py b/python/cugraph/cugraph/experimental/link_prediction/__init__.py deleted file mode 100644 index 081b2ae8260..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/cugraph/cugraph/experimental/link_prediction/jaccard.py b/python/cugraph/cugraph/experimental/link_prediction/jaccard.py deleted file mode 100644 index 2eba73b3824..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/jaccard.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_edge_score_to_dictionary, - renumber_vertex_pair, -) -import cudf -import warnings - -from pylibcugraph.experimental import ( - jaccard_coefficients as pylibcugraph_jaccard_coefficients, -) -from pylibcugraph import ResourceHandle - - -# FIXME: Move this function to the utility module so that it can be -# shared by other algos -def ensure_valid_dtype(input_graph, vertex_pair): - - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] - vertex_pair_dtypes = vertex_pair.dtypes - - if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: - warning_msg = ( - "Jaccard requires 'vertex_pair' to match the graph's 'vertex' type. " - f"input graph's vertex type is: {vertex_dtype} and got " - f"'vertex_pair' of type: {vertex_pair_dtypes}." - ) - warnings.warn(warning_msg, UserWarning) - vertex_pair = vertex_pair.astype(vertex_dtype) - - return vertex_pair - - -def EXPERIMENTAL__jaccard(G, vertex_pair=None, use_weight=False): - """ - Compute the Jaccard similarity between each pair of vertices connected by - an edge, or between arbitrary pairs of vertices specified by the user. - Jaccard similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context - of graphs, the neighborhood of a vertex is seen as a set. The Jaccard - similarity weight of each edge represents the strength of connection - between vertices based on the relative similarity of their neighbors. If - first is specified but second is not, or vice versa, an exception will be - thrown. - - NOTE: If the vertex_pair parameter is not specified then the behavior - of cugraph.jaccard is different from the behavior of - networkx.jaccard_coefficient. 
- - cugraph.jaccard, in the absence of a specified vertex pair list, will - compute the two_hop_neighbors of the entire graph to construct a vertex pair - list and will return the jaccard coefficient for those vertex pairs. This is - not advisable as the vertex_pairs can grow exponentially with respect to the - size of the datasets - - networkx.jaccard_coefficient, in the absence of a specified vertex - pair list, will return an upper triangular dense matrix, excluding - the diagonal as well as vertex pairs that are directly connected - by an edge in the graph, of jaccard coefficients. Technically, networkx - returns a lazy iterator across this upper triangular matrix where - the actual jaccard coefficient is computed when the iterator is - dereferenced. Computing a dense matrix of results is not feasible - if the number of vertices in the graph is large (100,000 vertices - would result in 4.9 billion values in that iterator). - - If your graph is small enough (or you have enough memory and patience) - you can get the interesting (non-zero) values that are part of the networkx - solution by doing the following: - - >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> pairs = G.get_two_hop_neighbors() - >>> df = cugraph.jaccard(G, pairs) - - But please remember that cugraph will fill the dataframe with the entire - solution you request, so you'll need enough memory to store the 2-hop - neighborhood dataframe. - - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - This implementation only supports undirected, unweighted Graph. 
- - vertex_pair : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the jaccard coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the jaccard coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Jaccard weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['jaccard_coeff'] : cudf.Series - The computed jaccard coefficient between the first and the second - vertex ID. 
- - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import jaccard as exp_jaccard - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_jaccard(G) - - """ - if G.is_directed(): - raise ValueError("Input must be an undirected Graph.") - - if G.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if vertex_pair is None: - # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() - - v_p_num_col = len(vertex_pair.columns) - - if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) - src_col_name = vertex_pair.columns[0] - dst_col_name = vertex_pair.columns[1] - first = vertex_pair[src_col_name] - second = vertex_pair[dst_col_name] - - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - use_weight = False - first, second, jaccard_coeff = pylibcugraph_jaccard_coefficients( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - first=first, - second=second, - use_weight=use_weight, - do_expensive_check=False, - ) - - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - - if v_p_num_col == 2: - # single column vertex - vertex_pair = vertex_pair.rename( - columns={src_col_name: "first", dst_col_name: "second"} - ) - - df = vertex_pair - df["jaccard_coeff"] = cudf.Series(jaccard_coeff) - - return df - - -def EXPERIMENTAL__jaccard_coefficient(G, ebunch=None, use_weight=False): - """ - For NetworkX Compatability. 
See `jaccard` - - Parameters - ---------- - graph : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - ebunch : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the jaccard coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the jaccard coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Jaccard weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - ddf['first']: dask_cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - ddf['second']: dask_cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - ddf['jaccard_coeff']: dask_cudf.Series - The computed jaccard coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import jaccard_coefficient as exp_jaccard_coefficient - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_jaccard_coefficient(G) - - """ - vertex_pair = None - - G, isNx = ensure_cugraph_obj_for_nx(G) - - # FIXME: What is the logic behind this since the docstrings mention that 'G' and - # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? 
- if isNx is True and ebunch is not None: - vertex_pair = cudf.DataFrame(ebunch) - - df = EXPERIMENTAL__jaccard(G, vertex_pair) - - if isNx is True: - df = df_edge_score_to_dictionary( - df, k="jaccard_coeff", src="first", dst="second" - ) - - return df diff --git a/python/cugraph/cugraph/experimental/link_prediction/overlap.py b/python/cugraph/cugraph/experimental/link_prediction/overlap.py deleted file mode 100644 index 0981ced4835..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/overlap.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_edge_score_to_dictionary, - renumber_vertex_pair, -) -import cudf -import warnings - -from pylibcugraph.experimental import ( - overlap_coefficients as pylibcugraph_overlap_coefficients, -) -from pylibcugraph import ResourceHandle - - -# FIXME: Move this function to the utility module so that it can be -# shared by other algos -def ensure_valid_dtype(input_graph, vertex_pair): - - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] - vertex_pair_dtypes = vertex_pair.dtypes - - if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: - warning_msg = ( - "Overlap requires 'vertex_pair' to match the graph's 'vertex' type. " - f"input graph's vertex type is: {vertex_dtype} and got " - f"'vertex_pair' of type: {vertex_pair_dtypes}." 
- ) - warnings.warn(warning_msg, UserWarning) - vertex_pair = vertex_pair.astype(vertex_dtype) - - return vertex_pair - - -def EXPERIMENTAL__overlap_coefficient(G, ebunch=None, use_weight=False): - """ - For NetworkX Compatability. See `overlap` - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - ebunch : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the Overlap coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the overlap coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the overlap weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - ddf['first']: dask_cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - ddf['second']: dask_cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - ddf['overlap_coeff']: dask_cudf.Series - The computed overlap coefficient between the first and the second - vertex ID. 
- - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import overlap_coefficient as exp_overlap_coefficient - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_overlap_coefficient(G) - """ - vertex_pair = None - - G, isNx = ensure_cugraph_obj_for_nx(G) - - # FIXME: What is the logic behind this since the docstrings mention that 'G' and - # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? - if isNx is True and ebunch is not None: - vertex_pair = cudf.DataFrame(ebunch) - - df = EXPERIMENTAL__overlap(G, vertex_pair) - - if isNx is True: - df = df_edge_score_to_dictionary( - df, k="overlap_coeff", src="first", dst="second" - ) - - return df - - -def EXPERIMENTAL__overlap(G, vertex_pair=None, use_weight=False): - """ - Compute the Overlap Coefficient between each pair of vertices connected by - an edge, or between arbitrary pairs of vertices specified by the user. - Overlap Coefficient is defined between two sets as the ratio of the volume - of their intersection divided by the smaller of their two volumes. In the - context of graphs, the neighborhood of a vertex is seen as a set. The - Overlap Coefficient weight of each edge represents the strength of - connection between vertices based on the relative similarity of their - neighbors. If first is specified but second is not, or vice versa, an - exception will be thrown. - - cugraph.overlap, in the absence of a specified vertex pair list, will - compute the two_hop_neighbors of the entire graph to construct a vertex pair - list and will return the overlap coefficient for those vertex pairs. This is - not advisable as the vertex_pairs can grow exponentially with respect to the - size of the datasets - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). 
The - adjacency list will be computed if not already present. - - This implementation only supports undirected, unweighted Graph. - - vertex_pair : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the overlap coefficient is computed for the - given vertex pairs, else, it is computed for all vertex pairs. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Overlap coefficients. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['overlap_coeff'] : cudf.Series - The computed overlap coefficient between the first and the second - vertex ID. 
- - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import overlap as exp_overlap - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_overlap(G) - - """ - - if G.is_directed(): - raise ValueError("Input must be an undirected Graph.") - - if G.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if vertex_pair is None: - # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() - - v_p_num_col = len(vertex_pair.columns) - - if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) - src_col_name = vertex_pair.columns[0] - dst_col_name = vertex_pair.columns[1] - first = vertex_pair[src_col_name] - second = vertex_pair[dst_col_name] - - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - use_weight = False - first, second, overlap_coeff = pylibcugraph_overlap_coefficients( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - first=first, - second=second, - use_weight=use_weight, - do_expensive_check=False, - ) - - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - - if v_p_num_col == 2: - # single column vertex - vertex_pair = vertex_pair.rename( - columns={src_col_name: "first", dst_col_name: "second"} - ) - - df = vertex_pair - df["overlap_coeff"] = cudf.Series(overlap_coeff) - - return df diff --git a/python/cugraph/cugraph/experimental/link_prediction/sorensen.py b/python/cugraph/cugraph/experimental/link_prediction/sorensen.py deleted file mode 100644 index ed27e4813d3..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/sorensen.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright 
(c) 2021-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_edge_score_to_dictionary, - renumber_vertex_pair, -) -import cudf -import warnings -from pylibcugraph.experimental import ( - sorensen_coefficients as pylibcugraph_sorensen_coefficients, -) -from pylibcugraph import ResourceHandle - - -# FIXME: Move this function to the utility module so that it can be -# shared by other algos -def ensure_valid_dtype(input_graph, vertex_pair): - - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] - vertex_pair_dtypes = vertex_pair.dtypes - - if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: - warning_msg = ( - "Sorensen requires 'vertex_pair' to match the graph's 'vertex' type. " - f"input graph's vertex type is: {vertex_dtype} and got " - f"'vertex_pair' of type: {vertex_pair_dtypes}." - ) - warnings.warn(warning_msg, UserWarning) - vertex_pair = vertex_pair.astype(vertex_dtype) - - return vertex_pair - - -def EXPERIMENTAL__sorensen(G, vertex_pair=None, use_weight=False): - """ - Compute the Sorensen coefficient between each pair of vertices connected by - an edge, or between arbitrary pairs of vertices specified by the user. - Sorensen coefficient is defined between two sets as the ratio of twice the - volume of their intersection divided by the volume of each set. 
- If first is specified but second is not, or vice versa, an exception will - be thrown. - - cugraph.sorensen, in the absence of a specified vertex pair list, will - compute the two_hop_neighbors of the entire graph to construct a vertex pair - list and will return the sorensen coefficient for those vertex pairs. This is - not advisable as the vertex_pairs can grow exponentially with respect to the - size of the datasets - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - This implementation only supports undirected, unweighted Graph. - - vertex_pair : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the Sorensen coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the Sorensen coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Sorensen index. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['sorensen_coeff'] : cudf.Series - The computed sorensen coefficient between the first and the second - vertex ID. 
- - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import sorensen as exp_sorensen - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_sorensen(G) - - """ - if G.is_directed(): - raise ValueError("Input must be an undirected Graph.") - - if G.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if vertex_pair is None: - # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() - - v_p_num_col = len(vertex_pair.columns) - - if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) - src_col_name = vertex_pair.columns[0] - dst_col_name = vertex_pair.columns[1] - first = vertex_pair[src_col_name] - second = vertex_pair[dst_col_name] - - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - use_weight = False - first, second, sorensen_coeff = pylibcugraph_sorensen_coefficients( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - first=first, - second=second, - use_weight=use_weight, - do_expensive_check=False, - ) - - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - - if v_p_num_col == 2: - # single column vertex - vertex_pair = vertex_pair.rename( - columns={src_col_name: "first", dst_col_name: "second"} - ) - - df = vertex_pair - df["sorensen_coeff"] = cudf.Series(sorensen_coeff) - - return df - - -def EXPERIMENTAL__sorensen_coefficient(G, ebunch=None, use_weight=False): - """ - For NetworkX Compatability. 
See `sorensen` - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - ebunch : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the sorensen coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the sorensen coefficient for all - adjacent vertices in the graph. - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Sorensen weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['sorensen_coeff'] : cudf.Series - The computed sorensen coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import sorensen_coefficient as exp_sorensen_coef - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_sorensen_coef(G) - - """ - vertex_pair = None - - G, isNx = ensure_cugraph_obj_for_nx(G) - - # FIXME: What is the logic behind this since the docstrings mention that 'G' and - # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? 
- if isNx is True and ebunch is not None: - vertex_pair = cudf.DataFrame(ebunch) - - df = EXPERIMENTAL__sorensen(G, vertex_pair) - - if isNx is True: - df = df_edge_score_to_dictionary( - df, k="sorensen_coeff", src="first", dst="second" - ) - - return df diff --git a/python/cugraph/cugraph/link_prediction/CMakeLists.txt b/python/cugraph/cugraph/link_prediction/CMakeLists.txt deleted file mode 100644 index a117cf9afc3..00000000000 --- a/python/cugraph/cugraph/link_prediction/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources jaccard_wrapper.pyx overlap_wrapper.pyx) -set(linked_libraries cugraph::cugraph) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX link_prediction_ - ASSOCIATED_TARGETS cugraph -) diff --git a/python/cugraph/cugraph/link_prediction/__init__.py b/python/cugraph/cugraph/link_prediction/__init__.py index a6911d3b8ae..a8517ee7c0f 100644 --- a/python/cugraph/cugraph/link_prediction/__init__.py +++ b/python/cugraph/cugraph/link_prediction/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,13 +11,26 @@ # See the License for the specific language governing permissions and # limitations under the License. + +from cugraph.utilities.api_tools import deprecated_warning_wrapper from cugraph.link_prediction.jaccard import jaccard from cugraph.link_prediction.jaccard import jaccard_coefficient + +from cugraph.link_prediction.sorensen import sorensen +from cugraph.link_prediction.sorensen import sorensen_coefficient + from cugraph.link_prediction.overlap import overlap +from cugraph.link_prediction.overlap import overlap_coefficient + +# To be deprecated from cugraph.link_prediction.wjaccard import jaccard_w + +jaccard_w = deprecated_warning_wrapper(jaccard_w) + from cugraph.link_prediction.woverlap import overlap_w + +overlap_w = deprecated_warning_wrapper(overlap_w) + from cugraph.link_prediction.wsorensen import sorensen_w -from cugraph.link_prediction.jaccard import jaccard_coefficient -from cugraph.link_prediction.sorensen import sorensen_coefficient -from cugraph.link_prediction.sorensen import sorensen -from cugraph.link_prediction.overlap import overlap_coefficient + +sorensen_w = deprecated_warning_wrapper(sorensen_w) diff --git a/python/cugraph/cugraph/link_prediction/jaccard.pxd b/python/cugraph/cugraph/link_prediction/jaccard.pxd deleted file mode 100644 index 9e8c82ec3d8..00000000000 --- a/python/cugraph/cugraph/link_prediction/jaccard.pxd +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.structure.graph_primtypes cimport * - - -cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": - - cdef void jaccard[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - WT *result) except + - - cdef void jaccard_list[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - ET num_pairs, - const VT *first, - const VT *second, - WT *result) except + diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 334d57f9d80..27bfa58e6b0 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -11,16 +11,54 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cudf -from cugraph.link_prediction import jaccard_wrapper from cugraph.utilities import ( ensure_cugraph_obj_for_nx, df_edge_score_to_dictionary, renumber_vertex_pair, ) +import cudf +import warnings +from typing import Union, Iterable + +from pylibcugraph import ( + jaccard_coefficients as pylibcugraph_jaccard_coefficients, +) +from pylibcugraph import ResourceHandle + +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. 
+# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Jaccard requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair -def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): +def jaccard( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated + use_weight: bool = False, +): """ Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -36,13 +74,11 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): of cugraph.jaccard is different from the behavior of networkx.jaccard_coefficient. - This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. - cugraph.jaccard, in the absence of a specified vertex pair list, will - use the edges of the graph to construct a vertex pair list and will - return the jaccard coefficient for those vertex pairs. + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the jaccard coefficient for those vertex pairs. 
This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets networkx.jaccard_coefficient, in the absence of a specified vertex pair list, will return an upper triangular dense matrix, excluding @@ -59,9 +95,9 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): solution by doing the following: >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> pairs = G.get_two_hop_neighbors() - >>> df = cugraph.jaccard(G, pairs) + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> pairs = input_graph.get_two_hop_neighbors() + >>> df = cugraph.jaccard(input_graph, pairs) But please remember that cugraph will fill the dataframe with the entire solution you request, so you'll need enough memory to store the 2-hop @@ -72,10 +108,11 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -84,9 +121,20 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): current implementation computes the jaccard coefficient for all adjacent vertices in the graph. 
- do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted jaccard (if use_weight==True) + or un-weighted jaccard (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + Returns ------- @@ -99,7 +147,7 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): df['first'] : cudf.Series The first vertex ID of each pair (will be identical to first if specified). df['second'] : cudf.Series - the second vertex ID of each pair (will be identical to second if + The second vertex ID of each pair (will be identical to second if specified). 
df['jaccard_coeff'] : cudf.Series The computed Jaccard coefficient between the first and the second @@ -108,65 +156,101 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.jaccard(G) + >>> from cugraph import jaccard + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = jaccard(input_graph) """ if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) if input_graph.is_directed(): raise ValueError("Input must be an undirected Graph.") - if type(vertex_pair) == cudf.DataFrame: + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") + raise ValueError("vertex_pair must be a cudf Dataframe") - df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair) + first, second, jaccard_coeff = pylibcugraph_jaccard_coefficients( + 
resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["jaccard_coeff"] = cudf.Series(jaccard_coeff) return df -def jaccard_coefficient(G, ebunch=None, do_expensive_check=True): +def jaccard_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, + do_expensive_check: bool = False, # deprecated +): """ For NetworkX Compatability. See `jaccard` - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. - Parameters ---------- - graph : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. - ebunch : cudf.DataFrame, optional (default=None) + This implementation only supports undirected, non-multi Graphs. 
+ + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the jaccard coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the jaccard coefficient for all - adjacent vertices in the graph. + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Jaccard coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the jaccard + coefficient for all adjacent vertices in the graph. + + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -188,10 +272,18 @@ def jaccard_coefficient(G, ebunch=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate + >>> from cugraph import jaccard_coefficient >>> G = karate.get_graph(download=True) - >>> df = cugraph.jaccard_coefficient(G) + >>> df = jaccard_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) diff --git a/python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx b/python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx deleted file mode 100644 index e66d8bf0b5c..00000000000 --- a/python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
-# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.link_prediction.jaccard cimport jaccard as c_jaccard -from cugraph.link_prediction.jaccard cimport jaccard_list as c_jaccard_list -from cugraph.structure.graph_primtypes cimport * -from cugraph.structure import graph_primtypes_wrapper -from libc.stdint cimport uintptr_t -import cudf -import numpy as np - - -def jaccard(input_graph, weights_arr=None, vertex_pair=None): - """ - Call jaccard or jaccard_list - """ - offsets = None - indices = None - - if input_graph.adjlist: - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], [np.int32]) - elif input_graph.transposedadjlist: - # - # NOTE: jaccard ONLY operates on an undirected graph, so CSR and CSC should be - # equivalent. The undirected check has already happened, so we'll just use - # the CSC as if it were CSR. 
- # - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.transposedadjlist.offsets, - input_graph.transposedadjlist.indices], [np.int32]) - else: - input_graph.view_adj_list() - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], [np.int32]) - - num_verts = input_graph.number_of_vertices() - num_edges = input_graph.number_of_edges(directed_edges=True) - - first = None - second = None - - cdef uintptr_t c_result_col = NULL - cdef uintptr_t c_first_col = NULL - cdef uintptr_t c_second_col = NULL - cdef uintptr_t c_src_index_col = NULL - cdef uintptr_t c_dst_index_col = NULL - cdef uintptr_t c_weights = NULL - cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0] - cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] - - cdef GraphCSRView[int,int,float] graph_float - cdef GraphCSRView[int,int,double] graph_double - - weight_type = np.float32 - - if weights_arr is not None: - [weights] = graph_primtypes_wrapper.datatype_cast([weights_arr], [np.float32, np.float64]) - c_weights = weights.__cuda_array_interface__['data'][0] - weight_type = weights.dtype - - if type(vertex_pair) == cudf.DataFrame: - result_size = len(vertex_pair) - result = cudf.Series(np.ones(result_size, dtype=weight_type)) - c_result_col = result.__cuda_array_interface__['data'][0] - - df = cudf.DataFrame() - df['jaccard_coeff'] = result - - cols = vertex_pair.columns.to_list() - first = vertex_pair[cols[0]].astype(np.int32) - second = vertex_pair[cols[1]].astype(np.int32) - - # FIXME: multi column support - df['first'] = first - df['second'] = second - c_first_col = first.__cuda_array_interface__['data'][0] - c_second_col = second.__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - graph_float = GraphCSRView[int,int,float](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_jaccard_list[int,int,float](graph_float, - c_weights, - result_size, - 
c_first_col, - c_second_col, - c_result_col) - else: - graph_double = GraphCSRView[int,int,double](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_jaccard_list[int,int,double](graph_double, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - - return df - else: - # error check performed in jaccard.py - assert vertex_pair is None - - df = cudf.DataFrame() - df['first'] = cudf.Series(np.zeros(num_edges, indices.dtype)) - df['second'] = indices - - c_src_index_col = df['first'].__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - df['jaccard_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float32), - nan_as_null=False) - c_result_col = df['jaccard_coeff'].__cuda_array_interface__['data'][0] - - graph_float = GraphCSRView[int,int,float](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_jaccard[int,int,float](graph_float, - c_weights, - c_result_col) - - graph_float.get_source_indices(c_src_index_col) - else: - df['jaccard_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float64), - nan_as_null=False) - c_result_col = df['jaccard_coeff'].__cuda_array_interface__['data'][0] - - graph_double = GraphCSRView[int,int,double](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_jaccard[int,int,double](graph_double, - c_weights, - c_result_col) - - graph_double.get_source_indices(c_src_index_col) - - return df diff --git a/python/cugraph/cugraph/link_prediction/overlap.pxd b/python/cugraph/cugraph/link_prediction/overlap.pxd deleted file mode 100644 index f0654472587..00000000000 --- a/python/cugraph/cugraph/link_prediction/overlap.pxd +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.structure.graph_primtypes cimport * - - -cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": - - cdef void overlap[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - WT *result) except + - - cdef void overlap_list[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - ET num_pairs, - const VT *first, - const VT *second, - WT *result) except + diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index 9bb7b76b0ca..3a25526679c 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -11,28 +11,120 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.link_prediction import overlap_wrapper -import cudf from cugraph.utilities import ( ensure_cugraph_obj_for_nx, df_edge_score_to_dictionary, renumber_vertex_pair, ) +import cudf +import warnings +from typing import Union, Iterable + +from pylibcugraph import ( + overlap_coefficients as pylibcugraph_overlap_coefficients, +) +from pylibcugraph import ResourceHandle + +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. 
+# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes -def overlap_coefficient(G, ebunch=None, do_expensive_check=True): + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Overlap requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair + + +def overlap_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, + do_expensive_check: bool = False, # deprecated +): """ - For NetworkX Compatability. See `overlap` + Compute overlap coefficient. + + Parameters + ---------- + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. + + This implementation only supports undirected, non-multi edge Graph. + + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) + A GPU dataframe consisting of two columns representing pairs of + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Overlap coefficient is computed for the given vertex + pairs. 
Otherwise, the current implementation computes the overlap + coefficient for all adjacent vertices in the graph. + + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the overlap weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['overlap_coeff']: dask_cudf.Series + The computed overlap coefficient between the first and the second + vertex ID. + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import overlap_coefficient + >>> G = karate.get_graph(download=True, ignore_weights=True) + >>> df = overlap_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) + # FIXME: What is the logic behind this since the docstrings mention that 'G' and + # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? 
if isNx is True and ebunch is not None: vertex_pair = cudf.DataFrame(ebunch) @@ -46,7 +138,12 @@ def overlap_coefficient(G, ebunch=None, do_expensive_check=True): return df -def overlap(input_graph, vertex_pair=None, do_expensive_check=True): +def overlap( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated + use_weight: bool = False, +): """ Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -58,25 +155,39 @@ def overlap(input_graph, vertex_pair=None, do_expensive_check=True): neighbors. If first is specified but second is not, or vice versa, an exception will be thrown. - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. + cugraph.overlap, in the absence of a specified vertex pair list, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the overlap coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets Parameters ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - adjacency list will be computed if not already present. + as an edge list. The adjacency list will be computed if not already + present. + This implementation only supports undirected, non-multi edge Graph. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. 
- do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted overlap (if use_weight==True) + or un-weighted overlap (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + Returns ------- @@ -98,35 +209,62 @@ def overlap(input_graph, vertex_pair=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.overlap(G) + >>> from cugraph import overlap + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = overlap(input_graph) """ if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(vertex_pair) == cudf.DataFrame: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + 
vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + elif vertex_pair is not None: raise ValueError("vertex_pair must be a cudf dataframe") - df = overlap_wrapper.overlap(input_graph, None, vertex_pair) + first, second, overlap_coeff = pylibcugraph_overlap_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["overlap_coeff"] = cudf.Series(overlap_coeff) return df diff --git a/python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx b/python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx deleted file mode 100644 index 0f61460a72f..00000000000 --- a/python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.link_prediction.overlap cimport overlap as c_overlap -from cugraph.link_prediction.overlap cimport overlap_list as c_overlap_list -from cugraph.structure.graph_primtypes cimport * -from cugraph.structure import graph_primtypes_wrapper -from libc.stdint cimport uintptr_t -import cudf -import numpy as np - - -def overlap(input_graph, weights_arr=None, vertex_pair=None): - """ - Call overlap or overlap_list - """ - - if not input_graph.adjlist: - input_graph.view_adj_list() - - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) - - num_verts = input_graph.number_of_vertices() - num_edges = input_graph.number_of_edges(directed_edges=True) - - first = None - second = None - - cdef uintptr_t c_result_col = NULL - cdef uintptr_t c_first_col = NULL - cdef uintptr_t c_second_col = NULL - cdef uintptr_t c_src_index_col = NULL - cdef uintptr_t c_dst_index_col = NULL - cdef uintptr_t c_weights = NULL - cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0] - cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] - - cdef GraphCSRView[int,int,float] graph_float - cdef GraphCSRView[int,int,double] graph_double - - weight_type = np.float32 - - if weights_arr is not None: - [weights] = graph_primtypes_wrapper.datatype_cast([weights_arr], [np.float32, np.float64]) - c_weights = weights.__cuda_array_interface__['data'][0] - 
weight_type = weights.dtype - - if type(vertex_pair) == cudf.DataFrame: - result_size = len(vertex_pair) - result = cudf.Series(np.ones(result_size, dtype=np.float32)) - c_result_col = result.__cuda_array_interface__['data'][0] - - df = cudf.DataFrame() - df['overlap_coeff'] = result - - cols = vertex_pair.columns.to_list() - first = vertex_pair[cols[0]] - second = vertex_pair[cols[1]] - - # FIXME: multi column support - df['first'] = first - df['second'] = second - c_first_col = first.__cuda_array_interface__['data'][0] - c_second_col = second.__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - graph_float = GraphCSRView[int,int,float](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_overlap_list[int,int,float](graph_float, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - else: - graph_double = GraphCSRView[int,int,double](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_overlap_list[int,int,double](graph_double, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - - return df - else: - # error check performed in overlap.py - assert vertex_pair is None - - df = cudf.DataFrame() - df['first'] = cudf.Series(np.zeros(num_edges, indices.dtype)) - df['second'] = indices - - c_src_index_col = df['first'].__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - df['overlap_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float32), - nan_as_null=False) - c_result_col = df['overlap_coeff'].__cuda_array_interface__['data'][0] - - graph_float = GraphCSRView[int,int,float](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_overlap[int,int,float](graph_float, - c_weights, - c_result_col) - - graph_float.get_source_indices(c_src_index_col) - else: - df['overlap_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float64), - nan_as_null=False) - c_result_col = df['overlap_coeff'].__cuda_array_interface__['data'][0] - - graph_double = 
GraphCSRView[int,int,double](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_overlap[int,int,double](graph_double, - c_weights, - c_result_col) - - graph_double.get_source_indices(c_src_index_col) - - return df diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index ef2bd8d674d..a8ccced1e68 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -11,17 +11,54 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cudf -from cugraph.structure.graph_classes import Graph -from cugraph.link_prediction import jaccard_wrapper from cugraph.utilities import ( ensure_cugraph_obj_for_nx, df_edge_score_to_dictionary, renumber_vertex_pair, ) +import cudf +import warnings +from typing import Union, Iterable + +from pylibcugraph import ( + sorensen_coefficients as pylibcugraph_sorensen_coefficients, +) +from pylibcugraph import ResourceHandle + +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Sorensen requires 'vertex_pair' to match the graph's 'vertex' type. 
" + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair -def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): +def sorensen( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated + use_weight: bool = False, +): """ Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -30,22 +67,20 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): If first is specified but second is not, or vice versa, an exception will be thrown. - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. - cugraph.sorensen, in the absence of a specified vertex pair list, will - use the edges of the graph to construct a vertex pair list and will - return the sorensen coefficient for those vertex pairs. + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the sorensen coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets Parameters ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. + as an edge list. The adjacency list will be computed if not already + present. + + This implementation only supports undirected, non-multi edge Graph. 
vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -54,9 +89,18 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): current implementation computes the Sorensen coefficient for all adjacent vertices in the graph. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted sorensen (if use_weight==True) + or un-weighted sorensen (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. Returns ------- @@ -67,79 +111,112 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): pairs. df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified) - + The first vertex ID of each pair (will be identical to first if specified). df['second'] : cudf.Series The second vertex ID of each pair (will be identical to second if - specified) - + specified). df['sorensen_coeff'] : cudf.Series - The computed Sorensen coefficient between the first and the second + The computed sorensen coefficient between the first and the second vertex ID. 
Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.sorensen(G) + >>> from cugraph import sorensen + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = sorensen(input_graph) """ if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(input_graph) is not Graph: - raise TypeError("input graph must a Graph") - - if type(vertex_pair) == cudf.DataFrame: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + elif vertex_pair is not None: raise ValueError("vertex_pair must be a cudf dataframe") - df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair) - df.jaccard_coeff = (2 * df.jaccard_coeff) / (1 + df.jaccard_coeff) - df.rename({"jaccard_coeff": "sorensen_coeff"}, axis=1, inplace=True) + first, second, sorensen_coeff = pylibcugraph_sorensen_coefficients( + 
resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) + if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["sorensen_coeff"] = cudf.Series(sorensen_coeff) return df -def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): +def sorensen_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, + do_expensive_check: bool = False, # deprecated +): """ - For NetworkX Compatability. See `sorensen` - - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. + Compute sorensen coefficient. Parameters ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - ebunch : cudf.DataFrame, optional (default=None) + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. 
+ + This implementation only supports undirected, non-multi Graphs. + + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the sorensen coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the sorensen coefficient for all - adjacent vertices in the graph. + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Overlap coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the overlap + coefficient for all adjacent vertices in the graph. + + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -152,7 +229,7 @@ def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): df['first'] : cudf.Series The first vertex ID of each pair (will be identical to first if specified). df['second'] : cudf.Series - the second vertex ID of each pair (will be identical to second if + The second vertex ID of each pair (will be identical to second if specified). 
df['sorensen_coeff'] : cudf.Series The computed Sorensen coefficient between the first and the second @@ -161,14 +238,24 @@ def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.sorensen_coefficient(G) + >>> from cugraph import sorensen_coefficient + >>> G = karate.get_graph(download=True, ignore_weights=True) + >>> df = sorensen_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) + # FIXME: What is the logic behind this since the docstrings mention that 'G' and + # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? if isNx is True and ebunch is not None: vertex_pair = cudf.DataFrame(ebunch) diff --git a/python/cugraph/cugraph/link_prediction/wjaccard.py b/python/cugraph/cugraph/link_prediction/wjaccard.py index e3486473fe5..ec538bbc0ed 100644 --- a/python/cugraph/cugraph/link_prediction/wjaccard.py +++ b/python/cugraph/cugraph/link_prediction/wjaccard.py @@ -11,13 +11,45 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure.graph_classes import Graph -from cugraph.link_prediction import jaccard_wrapper +from cugraph.link_prediction import jaccard import cudf -from cugraph.utilities import renumber_vertex_pair +import warnings +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional -def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. 
+# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Jaccard requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair + + +def jaccard_w( + input_graph: Graph, + weights: cudf.DataFrame = None, # deprecated + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated +): """ Compute the weighted Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -55,9 +87,13 @@ def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): vertices. If provided, the jaccard coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. 
Returns ------- @@ -95,47 +131,9 @@ def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): >>> df = cugraph.jaccard_w(G, weights) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(input_graph) is not Graph: - raise TypeError("input graph must a Graph") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - if input_graph.renumbered: - # The 'vertex' column of the cudf 'weights' also needs to be renumbered - # if the graph was renumbered - vertex_size = input_graph.vertex_column_size() - # single-column vertices i.e only one src and dst columns - if vertex_size == 1: - weights = input_graph.add_internal_vertex_id(weights, "vertex", "vertex") - # multi-column vertices i.e more than one src and dst columns - else: - cols = weights.columns[:vertex_size].to_list() - weights = input_graph.add_internal_vertex_id(weights, "vertex", cols) - - jaccard_weights = weights["weight"] - df = jaccard_wrapper.jaccard(input_graph, jaccard_weights, vertex_pair) - - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") - - return df + warning_msg = ( + "jaccard_w is deprecated. 
To compute weighted jaccard, please use " + "jaccard(input_graph, vertex_pair=False, use_weight=True)" + ) + warnings.warn(warning_msg, FutureWarning) + return jaccard(input_graph, vertex_pair, do_expensive_check, use_weight=True) diff --git a/python/cugraph/cugraph/link_prediction/woverlap.py b/python/cugraph/cugraph/link_prediction/woverlap.py index d7ebc5fc684..5f43ad0670b 100644 --- a/python/cugraph/cugraph/link_prediction/woverlap.py +++ b/python/cugraph/cugraph/link_prediction/woverlap.py @@ -11,12 +11,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.link_prediction import overlap_wrapper +from cugraph.link_prediction import overlap import cudf -from cugraph.utilities import renumber_vertex_pair +import warnings +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional -def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +def overlap_w( + input_graph: Graph, + weights: cudf.DataFrame = None, # deprecated + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated +): """ Compute the weighted Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -55,9 +69,13 @@ def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. 
- do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -96,43 +114,9 @@ def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): ... len(weights['vertex']))] >>> df = cugraph.overlap_w(G, weights) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - if input_graph.renumbered: - vertex_size = input_graph.vertex_column_size() - if vertex_size == 1: - weights = input_graph.add_internal_vertex_id(weights, "vertex", "vertex") - else: - cols = weights.columns[:vertex_size].to_list() - weights = input_graph.add_internal_vertex_id(weights, "vertex", cols) - - overlap_weights = weights["weight"] - - overlap_weights = overlap_weights.astype("float32") - - df = overlap_wrapper.overlap(input_graph, overlap_weights, vertex_pair) - - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, 
"second") - - return df + warning_msg = ( + " overlap_w is deprecated. To compute weighted overlap, please use " + "overlap(input_graph, vertex_pair=False, use_weight=True)" + ) + warnings.warn(warning_msg, FutureWarning) + return overlap(input_graph, vertex_pair, do_expensive_check, use_weight=True) diff --git a/python/cugraph/cugraph/link_prediction/wsorensen.py b/python/cugraph/cugraph/link_prediction/wsorensen.py index 8337b4602de..ff502b36837 100644 --- a/python/cugraph/cugraph/link_prediction/wsorensen.py +++ b/python/cugraph/cugraph/link_prediction/wsorensen.py @@ -11,13 +11,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure.graph_classes import Graph -from cugraph.link_prediction import jaccard_wrapper +from cugraph.link_prediction import sorensen import cudf -from cugraph.utilities import renumber_vertex_pair +import warnings +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional -def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +def sorensen_w( + input_graph: Graph, + weights: cudf.DataFrame = None, # deprecated + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated +): """ Compute the weighted Sorensen similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -51,9 +64,13 @@ def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): vertices. If provided, the sorensen coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. 
- do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -93,44 +110,9 @@ def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): >>> df = cugraph.sorensen_w(G, weights) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(input_graph) is not Graph: - raise TypeError("input graph must a Graph") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - if input_graph.renumbered: - vertex_size = input_graph.vertex_column_size() - if vertex_size == 1: - weights = input_graph.add_internal_vertex_id(weights, "vertex", "vertex") - else: - cols = weights.columns[:vertex_size].to_list() - weights = input_graph.add_internal_vertex_id(weights, "vertex", cols) - jaccard_weights = weights["weight"] - df = jaccard_wrapper.jaccard(input_graph, jaccard_weights, vertex_pair) - df.jaccard_coeff = (2 * df.jaccard_coeff) / (1 + df.jaccard_coeff) - df.rename({"jaccard_coeff": 
"sorensen_coeff"}, axis=1, inplace=True) - - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") - - return df + warning_msg = ( + "sorensen_w is deprecated. To compute weighted sorensen, please use " + "sorensen(input_graph, vertex_pair=False, use_weight=True)" + ) + warnings.warn(warning_msg, FutureWarning) + return sorensen(input_graph, vertex_pair, use_weight=True) diff --git a/python/cugraph/cugraph/sampling/random_walks.py b/python/cugraph/cugraph/sampling/random_walks.py index 015c05d1b08..7b04dba82a5 100644 --- a/python/cugraph/cugraph/sampling/random_walks.py +++ b/python/cugraph/cugraph/sampling/random_walks.py @@ -25,11 +25,10 @@ from cugraph.utilities.utils import import_optional from typing import Union, Tuple -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. networkx = import_optional("networkx") diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index cd883fb88f2..7ce7d263eda 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -11,6 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# FIXME: Can we use global variables for column names instead of hardcoded ones? 
+ import gc import pytest @@ -20,12 +22,19 @@ import cugraph from cugraph.datasets import netscience from cugraph.testing import utils, UNDIRECTED_DATASETS -from cugraph.experimental import jaccard as exp_jaccard -from cudf.testing import assert_series_equal, assert_frame_equal -from cugraph.experimental import jaccard_coefficient as exp_jaccard_coefficient - +from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal -print("Networkx version : {} ".format(nx.__version__)) +SRC_COL = "0" +DST_COL = "1" +VERTEX_PAIR_FIRST_COL = "first" +VERTEX_PAIR_SECOND_COL = "second" +JACCARD_COEFF_COL = "jaccard_coeff" +EDGE_ATT_COL = "weight" +MULTI_COL_SRC_0_COL = "src_0" +MULTI_COL_DST_0_COL = "dst_0" +MULTI_COL_SRC_1_COL = "src_1" +MULTI_COL_DST_1_COL = "dst_1" # ============================================================================= @@ -38,65 +47,79 @@ def setup_function(): # ============================================================================= # Helper functions # ============================================================================= -def compare_jaccard_two_hop(G, Gnx, edgevals=True): + + +def compare_jaccard_two_hop(G, Gnx, use_weight=False): """ Compute both cugraph and nx jaccard after extracting the two hop neighbors from G and compare both results """ pairs = ( G.get_two_hop_neighbors() - .sort_values(["first", "second"]) + .sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]) .reset_index(drop=True) ) - nx_pairs = list(pairs.to_records(index=False)) - preds = nx.jaccard_coefficient(Gnx, nx_pairs) - nx_coeff = [] - for u, v, p in preds: - # print(u, " ", v, " ", p) - nx_coeff.append(p) df = cugraph.jaccard(G, pairs) - df = df.sort_values(by=["first", "second"]).reset_index(drop=True) - if not edgevals: - # experimental jaccard currently only supports unweighted graphs - df_exp = exp_jaccard(G, pairs) - df_exp = df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) - assert_frame_equal(df, 
df_exp, check_dtype=False, check_like=True) + df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) - assert len(nx_coeff) == len(df) - for i in range(len(df)): - diff = abs(nx_coeff[i] - df["jaccard_coeff"].iloc[i]) - assert diff < 1.0e-6 + if not use_weight: + nx_pairs = list(pairs.to_records(index=False)) + preds = nx.jaccard_coefficient(Gnx, nx_pairs) + nx_coeff = [] + for u, v, p in preds: + nx_coeff.append(p) + + assert len(nx_coeff) == len(df) + for i in range(len(df)): + diff = abs(nx_coeff[i] - df[JACCARD_COEFF_COL].iloc[i]) + assert diff < 1.0e-6 + else: + # FIXME: compare results against resultset api + pass -def cugraph_call(benchmark_callable, graph_file, edgevals=False, input_df=None): +def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False): G = cugraph.Graph() - G = graph_file.get_graph(ignore_weights=not edgevals) + G = graph_file.get_graph(ignore_weights=not use_weight) # If no vertex_pair is passed as input, 'cugraph.jaccard' will # compute the 'jaccard_similarity' with the two_hop_neighbor of the # entire graph while nx compute with the one_hop_neighbor. 
For better # comparaison, get the one_hop_neighbor of the entire graph for 'cugraph.jaccard' # and pass it as vertex_pair - vertex_pair = input_df.rename(columns={"0": "first", "1": "second"}) - vertex_pair = vertex_pair[["first", "second"]] + if isinstance(input_df, cudf.DataFrame): + vertex_pair = input_df.rename( + columns={SRC_COL: VERTEX_PAIR_FIRST_COL, DST_COL: VERTEX_PAIR_SECOND_COL} + ) + vertex_pair = vertex_pair[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] + else: + vertex_pair = cudf.DataFrame( + columns=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL], + dtype=G.edgelist.edgelist_df["src"].dtype, + ) # cugraph Jaccard Call - df = benchmark_callable(cugraph.jaccard, G, vertex_pair=vertex_pair) + df = benchmark_callable( + cugraph.jaccard, G, vertex_pair=vertex_pair, use_weight=use_weight + ) - df = df.sort_values(["first", "second"]).reset_index(drop=True) + df = df.sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) return ( - df["first"].to_numpy(), - df["second"].to_numpy(), - df["jaccard_coeff"].to_numpy(), + df[VERTEX_PAIR_FIRST_COL].to_numpy(), + df[VERTEX_PAIR_SECOND_COL].to_numpy(), + df[JACCARD_COEFF_COL].to_numpy(), ) def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] + sources = M[SRC_COL] + destinations = M[DST_COL] edges = [] for i in range(len(M)): edges.append((sources[i], destinations[i])) @@ -108,7 +131,11 @@ def networkx_call(M, benchmark_callable=None): print("Format conversion ... 
") Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, + source=SRC_COL, + target=DST_COL, + edge_attr=EDGE_ATT_COL, + create_using=nx.Graph(), ) # Networkx Jaccard Call @@ -144,118 +171,130 @@ def read_csv(request): @pytest.mark.sg -def test_jaccard(read_csv, gpubenchmark): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard(read_csv, gpubenchmark, use_weight): M_cu, M, graph_file = read_csv - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file, input_df=M_cu) - nx_src, nx_dst, nx_coeff = networkx_call(M) + cu_src, cu_dst, cu_coeff = cugraph_call( + gpubenchmark, graph_file, input_df=M_cu, use_weight=use_weight + ) + if not use_weight: + nx_src, nx_dst, nx_coeff = networkx_call(M) - # Calculating mismatch - err = 0 - tol = 1.0e-06 + # Calculating mismatch + err = 0 + tol = 1.0e-06 - assert len(cu_coeff) == len(nx_coeff) - for i in range(len(cu_coeff)): - if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: - err += 1 + assert len(cu_coeff) == len(nx_coeff) + for i in range(len(cu_coeff)): + if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: + err += 1 - print("Mismatches: %d" % err) - assert err == 0 + print("Mismatches: %d" % err) + assert err == 0 + else: + G = graph_file.get_graph() + res_w_jaccard = cugraph.jaccard_w(G, vertex_pair=M_cu[[SRC_COL, DST_COL]]) + res_w_jaccard = res_w_jaccard.sort_values( + [VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL] + ).reset_index(drop=True) + res_jaccard = cudf.DataFrame() + res_jaccard[VERTEX_PAIR_FIRST_COL] = cu_src + res_jaccard[VERTEX_PAIR_SECOND_COL] = cu_dst + res_jaccard[JACCARD_COEFF_COL] = cu_coeff + assert_frame_equal( + res_w_jaccard, res_jaccard, check_dtype=False, check_like=True + ) + + # FIXME: compare weighted jaccard results against resultset api @pytest.mark.sg -def test_directed_graph_check(read_csv): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_directed_graph_check(read_csv, use_weight): _, M, _ = 
read_csv cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[SRC_COL] = cudf.Series(M[SRC_COL]) + cu_M[DST_COL] = cudf.Series(M[DST_COL]) + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph(directed=True) - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) + weight = EDGE_ATT_COL if use_weight else None + G1.from_cudf_edgelist(cu_M, source=SRC_COL, destination=DST_COL, weight=weight) + + vertex_pair = cu_M[[SRC_COL, DST_COL]] - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] vertex_pair = vertex_pair[:5] with pytest.raises(ValueError): - cugraph.jaccard(G1, vertex_pair) + cugraph.jaccard(G1, vertex_pair, use_weight) @pytest.mark.sg def test_nx_jaccard_time(read_csv, gpubenchmark): - _, M, _ = read_csv nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) -@pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") -def test_jaccard_edgevals(gpubenchmark, graph_file): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard_edgevals(gpubenchmark, graph_file, use_weight): dataset_path = netscience.get_path() M = utils.read_csv_for_nx(dataset_path) M_cu = utils.read_csv_file(dataset_path) cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, netscience, edgevals=True, input_df=M_cu + gpubenchmark, netscience, input_df=M_cu, use_weight=use_weight ) - nx_src, nx_dst, nx_coeff = networkx_call(M) + if not use_weight: + nx_src, nx_dst, nx_coeff = networkx_call(M) - # Calculating mismatch - err = 0 - tol = 1.0e-06 - - assert len(cu_coeff) == len(nx_coeff) - for i in range(len(cu_coeff)): - if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: - err += 1 - - print("Mismatches: %d" % err) - assert err == 0 + # Calculating mismatch + err = 0 + tol = 1.0e-06 + 
assert len(cu_coeff) == len(nx_coeff) + for i in range(len(cu_coeff)): + if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: + err += 1 -@pytest.mark.sg -def test_jaccard_two_hop(read_csv): - - _, M, graph_file = read_csv - - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - G = graph_file.get_graph(ignore_weights=True) - - compare_jaccard_two_hop(G, Gnx) + print("Mismatches: %d" % err) + assert err == 0 + else: + # FIXME: compare results against resultset api + pass @pytest.mark.sg -def test_jaccard_two_hop_edge_vals(read_csv): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard_two_hop(read_csv, use_weight): _, M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() ) + G = graph_file.get_graph(ignore_weights=not use_weight) - G = graph_file.get_graph() - - compare_jaccard_two_hop(G, Gnx, edgevals=True) + compare_jaccard_two_hop(G, Gnx, use_weight) @pytest.mark.sg def test_jaccard_nx(read_csv): - M_cu, M, _ = read_csv - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) + Gnx = nx.from_pandas_edgelist( + M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() + ) nx_j = nx.jaccard_coefficient(Gnx) nv_js = sorted(nx_j, key=len, reverse=True) - ebunch = M_cu.rename(columns={"0": "first", "1": "second"}) - ebunch = ebunch[["first", "second"]] + ebunch = M_cu.rename( + columns={SRC_COL: VERTEX_PAIR_FIRST_COL, DST_COL: VERTEX_PAIR_SECOND_COL} + ) + ebunch = ebunch[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] cg_j = cugraph.jaccard_coefficient(Gnx, ebunch=ebunch) - cg_j_exp = exp_jaccard_coefficient(Gnx, ebunch=ebunch) assert len(nv_js) > len(cg_j) - assert len(nv_js) > len(cg_j_exp) # FIXME: Nx does a full all-pair Jaccard. 
# cuGraph does a limited 1-hop Jaccard @@ -263,68 +302,58 @@ def test_jaccard_nx(read_csv): @pytest.mark.sg -def test_jaccard_multi_column(read_csv): - - _, M, _ = read_csv +@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard_multi_column(graph_file, use_weight): + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph() + weight = EDGE_ATT_COL if use_weight else None G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], + weight=weight, ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] - df_res = cugraph.jaccard(G1, vertex_pair) - df_plc_exp = exp_jaccard(G1, vertex_pair) - - df_plc_exp = df_plc_exp.rename( - columns={ - "0_src": "0_source", - "0_dst": "0_destination", - "1_src": "1_source", - "1_dst": "1_destination", - } - ) - - jaccard_res = df_res["jaccard_coeff"].sort_values().reset_index(drop=True) - jaccard_plc_exp = df_plc_exp["jaccard_coeff"].sort_values().reset_index(drop=True) - assert_series_equal(jaccard_res, jaccard_plc_exp) + df_multi_col_res = cugraph.jaccard(G1, vertex_pair) G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, 
source="src_0", destination="dst_0") - df_exp = cugraph.jaccard(G2, vertex_pair[["src_0", "dst_0"]]) + G2.from_cudf_edgelist( + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL, weight=weight + ) + df_single_col_res = cugraph.jaccard( + G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] + ) # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) + actual = df_multi_col_res.sort_values("0_src").reset_index() + expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() + assert_series_equal(actual[JACCARD_COEFF_COL], expected[JACCARD_COEFF_COL]) @pytest.mark.sg -def test_weighted_exp_jaccard(): +def test_weighted_jaccard(): karate = UNDIRECTED_DATASETS[0] - G = karate.get_graph() - with pytest.raises(ValueError): - exp_jaccard(G) - G = karate.get_graph(ignore_weights=True) - use_weight = True - with pytest.raises(ValueError): - exp_jaccard(G, use_weight=use_weight) - - -@pytest.mark.sg -def test_invalid_datasets_jaccard(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") with pytest.raises(ValueError): - cugraph.jaccard(G) + cugraph.jaccard(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index 586d534cd42..e24deaa61ac 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -20,8 +20,19 @@ import cudf import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS -from cugraph.experimental import overlap as exp_overlap -from cudf.testing import assert_series_equal, assert_frame_equal +from cudf.testing import assert_series_equal +from 
cudf.testing.testing import assert_frame_equal + +SRC_COL = "0" +DST_COL = "1" +VERTEX_PAIR_FIRST_COL = "first" +VERTEX_PAIR_SECOND_COL = "second" +OVERLAP_COEFF_COL = "overlap_coeff" +EDGE_ATT_COL = "weight" +MULTI_COL_SRC_0_COL = "src_0" +MULTI_COL_DST_0_COL = "dst_0" +MULTI_COL_SRC_1_COL = "src_1" +MULTI_COL_DST_1_COL = "dst_1" # ============================================================================= @@ -35,7 +46,6 @@ def setup_function(): # Helper functions # ============================================================================= def compare_overlap(cu_coeff, cpu_coeff): - assert len(cu_coeff) == len(cpu_coeff) for i in range(len(cu_coeff)): if np.isnan(cpu_coeff[i]): @@ -47,21 +57,21 @@ def compare_overlap(cu_coeff, cpu_coeff): assert diff < 1.0e-6 -def cugraph_call(benchmark_callable, graph_file, pairs, edgevals=False): +def cugraph_call(benchmark_callable, graph_file, pairs, use_weight=False): # Device data G = graph_file.get_graph( - create_using=cugraph.Graph(directed=False), ignore_weights=not edgevals + create_using=cugraph.Graph(directed=False), ignore_weights=not use_weight ) # cugraph Overlap Call df = benchmark_callable(cugraph.overlap, G, pairs) - df = df.sort_values(by=["first", "second"]).reset_index(drop=True) - if not edgevals: - # experimental overlap currently only supports unweighted graphs - df_exp = exp_overlap(G, pairs) - df_exp = df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) - assert_frame_equal(df, df_exp, check_dtype=False, check_like=True) + df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) + if use_weight: + res_w_overlap = cugraph.overlap_w(G, vertex_pair=pairs) + assert_frame_equal(res_w_overlap, df, check_dtype=False, check_like=True) - return df["overlap_coeff"].to_numpy() + return df[OVERLAP_COEFF_COL].to_numpy() def intersection(a, b, M): @@ -120,8 +130,10 @@ def read_csv(request): dataset_path = graph_file.get_path() Mnx = 
utils.read_csv_for_nx(dataset_path) - N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 - M = scipy.sparse.csr_matrix((Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N)) + N = max(max(Mnx[SRC_COL]), max(Mnx[DST_COL])) + 1 + M = scipy.sparse.csr_matrix( + (Mnx.weight, (Mnx[SRC_COL], Mnx[DST_COL])), shape=(N, N) + ) return M, graph_file @@ -135,7 +147,7 @@ def extract_two_hop(read_csv): G = graph_file.get_graph(ignore_weights=True) pairs = ( G.get_two_hop_neighbors() - .sort_values(["first", "second"]) + .sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]) .reset_index(drop=True) ) @@ -144,93 +156,91 @@ def extract_two_hop(read_csv): # Test @pytest.mark.sg -def test_overlap(gpubenchmark, read_csv, extract_two_hop): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_overlap(gpubenchmark, read_csv, extract_two_hop, use_weight): M, graph_file = read_csv pairs = extract_two_hop - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs) - cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) + cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs, use_weight=use_weight) + cpu_coeff = cpu_call(M, pairs[VERTEX_PAIR_FIRST_COL], pairs[VERTEX_PAIR_SECOND_COL]) compare_overlap(cu_coeff, cpu_coeff) -# Test @pytest.mark.sg -def test_overlap_edge_vals(gpubenchmark, read_csv, extract_two_hop): +@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) +@pytest.mark.parametrize("use_weight", [False, True]) +def test_directed_graph_check(graph_file, use_weight): + M = utils.read_csv_for_nx(graph_file.get_path()) + cu_M = cudf.DataFrame() + cu_M[SRC_COL] = cudf.Series(M[SRC_COL]) + cu_M[DST_COL] = cudf.Series(M[DST_COL]) + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) - M, graph_file = read_csv - pairs = extract_two_hop + G1 = cugraph.Graph(directed=True) + weight = EDGE_ATT_COL if use_weight else None + G1.from_cudf_edgelist(cu_M, source=SRC_COL, destination=DST_COL, weight=weight) - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs, 
edgevals=True) - cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) + vertex_pair = cu_M[[SRC_COL, DST_COL]] - compare_overlap(cu_coeff, cpu_coeff) + vertex_pair = vertex_pair[:5] + with pytest.raises(ValueError): + cugraph.overlap(G1, vertex_pair, use_weight) @pytest.mark.sg @pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) -def test_overlap_multi_column(graph_file): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_overlap_multi_column(graph_file, use_weight): dataset_path = graph_file.get_path() M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph() + weight = EDGE_ATT_COL if use_weight else None G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], + weight=weight, ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] - df_res = cugraph.overlap(G1, vertex_pair) - df_plc_exp = exp_overlap(G1, vertex_pair) - - df_plc_exp = df_plc_exp.rename( - columns={ - "0_src": "0_source", - "0_dst": "0_destination", - "1_src": "1_source", - "1_dst": "1_destination", - } - ) - overlap_res = df_res["overlap_coeff"].sort_values().reset_index(drop=True) - overlap_plc_exp = df_plc_exp["overlap_coeff"].sort_values().reset_index(drop=True) - 
assert_series_equal(overlap_res, overlap_plc_exp) - + df_multi_col_res = cugraph.overlap(G1, vertex_pair, use_weight=use_weight) G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - df_exp = cugraph.overlap(G2, vertex_pair[["src_0", "dst_0"]]) + G2.from_cudf_edgelist( + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL, weight=weight + ) + df_single_col_res = cugraph.overlap( + G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] + ) # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"]) + actual = df_multi_col_res.sort_values("0_src").reset_index() + expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() + assert_series_equal(actual[OVERLAP_COEFF_COL], expected[OVERLAP_COEFF_COL]) @pytest.mark.sg -def test_weighted_exp_overlap(): +def test_weighted_overlap(): karate = UNDIRECTED_DATASETS[0] - G = karate.get_graph() - with pytest.raises(ValueError): - exp_overlap(G) - G = karate.get_graph(ignore_weights=True) - use_weight = True - with pytest.raises(ValueError): - exp_overlap(G, use_weight=use_weight) - - -@pytest.mark.sg -def test_invalid_datasets_overlap(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") with pytest.raises(ValueError): - cugraph.overlap(G) + cugraph.overlap(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 3da33a3e853..6b4074fce30 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -20,11 +20,19 @@ import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS from 
cugraph.datasets import netscience -from cugraph.experimental import sorensen as exp_sorensen -from cudf.testing import assert_series_equal, assert_frame_equal +from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal - -print("Networkx version : {} ".format(nx.__version__)) +SRC_COL = "0" +DST_COL = "1" +VERTEX_PAIR_FIRST_COL = "first" +VERTEX_PAIR_SECOND_COL = "second" +SORENSEN_COEFF_COL = "sorensen_coeff" +EDGE_ATT_COL = "weight" +MULTI_COL_SRC_0_COL = "src_0" +MULTI_COL_DST_0_COL = "dst_0" +MULTI_COL_SRC_1_COL = "src_1" +MULTI_COL_DST_1_COL = "dst_1" # ============================================================================= @@ -37,68 +45,89 @@ def setup_function(): # ============================================================================= # Helper functions # ============================================================================= -def compare_sorensen_two_hop(G, Gnx, edgevals=False): +def compare_sorensen_two_hop(G, Gnx, use_weight=False): """ Compute both cugraph and nx sorensen after extracting the two hop neighbors from G and compare both results """ pairs = ( G.get_two_hop_neighbors() - .sort_values(["first", "second"]) + .sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]) .reset_index(drop=True) ) - nx_pairs = [] - nx_pairs = list(pairs.to_records(index=False)) - preds = nx.jaccard_coefficient(Gnx, nx_pairs) - nx_coeff = [] - for u, v, p in preds: + + # print(f'G = {G.edgelist.edgelist_df}') + + df = cugraph.sorensen(G, pairs) + df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) + + if not use_weight: + nx_pairs = list(pairs.to_records(index=False)) + + # print(f'nx_pairs = {len(nx_pairs)}') + + preds = nx.jaccard_coefficient(Gnx, nx_pairs) + # FIXME: Use known correct values of Sorensen for few graphs, # hardcode it and compare to Cugraph Sorensen to get a more robust test # Conversion from Networkx Jaccard to Sorensen # No networkX 
equivalent - nx_coeff.append((2 * p) / (1 + p)) - df = cugraph.sorensen(G, pairs) - df = df.sort_values(by=["first", "second"]).reset_index(drop=True) - if not edgevals: - # experimental sorensen currently only supports unweighted graphs - df_exp = exp_sorensen(G, pairs) - df_exp = df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) - assert_frame_equal(df, df_exp, check_dtype=False, check_like=True) - assert len(nx_coeff) == len(df) - for i in range(len(df)): - diff = abs(nx_coeff[i] - df["sorensen_coeff"].iloc[i]) - assert diff < 1.0e-6 - - -def cugraph_call(benchmark_callable, graph_file, edgevals=False, input_df=None): + + nx_coeff = list(map(lambda x: (2 * x[2]) / (1 + x[2]), preds)) + + assert len(nx_coeff) == len(df) + for i in range(len(df)): + diff = abs(nx_coeff[i] - df[SORENSEN_COEFF_COL].iloc[i]) + assert diff < 1.0e-6 + else: + # FIXME: compare results against resultset api + res_w_sorensen = cugraph.sorensen_w(G, vertex_pair=pairs) + res_w_sorensen = res_w_sorensen.sort_values( + [VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL] + ).reset_index(drop=True) + assert_frame_equal(res_w_sorensen, df, check_dtype=False, check_like=True) + + +def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False): G = cugraph.Graph() - G = graph_file.get_graph(ignore_weights=not edgevals) + G = graph_file.get_graph(ignore_weights=not use_weight) # If no vertex_pair is passed as input, 'cugraph.sorensen' will # compute the 'sorensen_similarity' with the two_hop_neighbor of the # entire graph while nx compute with the one_hop_neighbor. 
For better # comparaison, get the one_hop_neighbor of the entire graph for 'cugraph.sorensen' # and pass it as vertex_pair - vertex_pair = input_df.rename(columns={"0": "first", "1": "second"}) - vertex_pair = vertex_pair[["first", "second"]] + if isinstance(input_df, cudf.DataFrame): + vertex_pair = input_df.rename( + columns={SRC_COL: VERTEX_PAIR_FIRST_COL, DST_COL: VERTEX_PAIR_SECOND_COL} + ) + vertex_pair = vertex_pair[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] + else: + vertex_pair = cudf.DataFrame( + columns=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL], + dtype=G.edgelist.edgelist_df["src"].dtype, + ) # cugraph Sorensen Call df = benchmark_callable(cugraph.sorensen, G, vertex_pair=vertex_pair) - df = df.sort_values(["first", "second"]).reset_index(drop=True) + df = df.sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) return ( - df["first"].to_numpy(), - df["second"].to_numpy(), - df["sorensen_coeff"].to_numpy(), + df[VERTEX_PAIR_FIRST_COL].to_numpy(), + df[VERTEX_PAIR_SECOND_COL].to_numpy(), + df[SORENSEN_COEFF_COL].to_numpy(), ) def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] + sources = M[SRC_COL] + destinations = M[DST_COL] edges = [] for i in range(len(M)): edges.append((sources[i], destinations[i])) @@ -110,7 +139,11 @@ def networkx_call(M, benchmark_callable=None): print("Format conversion ... 
") Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, + source=SRC_COL, + target=DST_COL, + edge_attr=EDGE_ATT_COL, + create_using=nx.Graph(), ) # Networkx Jaccard Call @@ -149,10 +182,12 @@ def read_csv(request): @pytest.mark.sg -def test_sorensen(gpubenchmark, read_csv): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen(gpubenchmark, read_csv, use_weight): M_cu, M, graph_file = read_csv - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file, input_df=M_cu) + cu_src, cu_dst, cu_coeff = cugraph_call( + gpubenchmark, graph_file, input_df=M_cu, use_weight=use_weight + ) nx_src, nx_dst, nx_coeff = networkx_call(M) # Calculating mismatch @@ -170,20 +205,42 @@ def test_sorensen(gpubenchmark, read_csv): @pytest.mark.sg def test_nx_sorensen_time(gpubenchmark, read_csv): - _, M, _ = read_csv nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) +@pytest.mark.sg +@pytest.mark.parametrize("use_weight", [False, True]) +def test_directed_graph_check(read_csv, use_weight): + _, M, _ = read_csv + + cu_M = cudf.DataFrame() + cu_M[SRC_COL] = cudf.Series(M[SRC_COL]) + cu_M[DST_COL] = cudf.Series(M[DST_COL]) + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + + G1 = cugraph.Graph(directed=True) + weight = EDGE_ATT_COL if use_weight else None + G1.from_cudf_edgelist(cu_M, source=SRC_COL, destination=DST_COL, weight=weight) + + vertex_pair = cu_M[[SRC_COL, DST_COL]] + + vertex_pair = vertex_pair[:5] + with pytest.raises(ValueError): + cugraph.sorensen(G1, vertex_pair, use_weight) + + @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) +@pytest.mark.parametrize("use_weight", [False, True]) @pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") -def test_sorensen_edgevals(gpubenchmark, graph_file): +def test_sorensen_edgevals(gpubenchmark, graph_file, use_weight): dataset_path = netscience.get_path() M = 
utils.read_csv_for_nx(dataset_path) M_cu = utils.read_csv_file(dataset_path) cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, netscience, edgevals=True, input_df=M_cu + gpubenchmark, netscience, input_df=M_cu, use_weight=use_weight ) nx_src, nx_dst, nx_coeff = networkx_call(M) @@ -201,92 +258,89 @@ def test_sorensen_edgevals(gpubenchmark, graph_file): @pytest.mark.sg -def test_sorensen_two_hop(read_csv): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen_two_hop(read_csv, use_weight): _, M, graph_file = read_csv - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - G = graph_file.get_graph(ignore_weights=True) + Gnx = nx.from_pandas_edgelist( + M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() + ) + G = graph_file.get_graph(ignore_weights=not use_weight) - compare_sorensen_two_hop(G, Gnx) + compare_sorensen_two_hop(G, Gnx, use_weight=use_weight) @pytest.mark.sg -def test_sorensen_two_hop_edge_vals(read_csv): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen_two_hop_edge_vals(read_csv, use_weight): _, M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, + source=SRC_COL, + target=DST_COL, + edge_attr=EDGE_ATT_COL, + create_using=nx.Graph(), ) - G = graph_file.get_graph() + G = graph_file.get_graph(ignore_weights=not use_weight) - compare_sorensen_two_hop(G, Gnx, edgevals=True) + compare_sorensen_two_hop(G, Gnx, use_weight=use_weight) @pytest.mark.sg -def test_sorensen_multi_column(read_csv): - - _, M, _ = read_csv +@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen_multi_column(graph_file, use_weight): + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = 
cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph() + weight = EDGE_ATT_COL if use_weight else None G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], + weight=weight, ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] - df_res = cugraph.sorensen(G1, vertex_pair) - df_plc_exp = exp_sorensen(G1, vertex_pair) - - df_plc_exp = df_plc_exp.rename( - columns={ - "0_src": "0_source", - "0_dst": "0_destination", - "1_src": "1_source", - "1_dst": "1_destination", - } - ) - sorensen_res = df_res["sorensen_coeff"].sort_values().reset_index(drop=True) - sorensen_plc_exp = df_plc_exp["sorensen_coeff"].sort_values().reset_index(drop=True) - assert_series_equal(sorensen_res, sorensen_plc_exp) + df_multi_col_res = cugraph.sorensen(G1, vertex_pair) G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - df_exp = cugraph.sorensen(G2, vertex_pair[["src_0", "dst_0"]]) + G2.from_cudf_edgelist( + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL, weight=weight + ) + df_single_col_res = cugraph.sorensen( + G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] + ) # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"]) + actual = 
df_multi_col_res.sort_values("0_src").reset_index() + expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() + assert_series_equal(actual[SORENSEN_COEFF_COL], expected[SORENSEN_COEFF_COL]) @pytest.mark.sg -def test_weighted_exp_sorensen(): +def test_weighted_sorensen(): karate = UNDIRECTED_DATASETS[0] - G = karate.get_graph() - with pytest.raises(ValueError): - exp_sorensen(G) - G = karate.get_graph(ignore_weights=True) - use_weight = True - with pytest.raises(ValueError): - exp_sorensen(G, use_weight=use_weight) - - -@pytest.mark.sg -def test_invalid_datasets_sorensen(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") with pytest.raises(ValueError): - cugraph.sorensen(G) + cugraph.sorensen(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py deleted file mode 100644 index 36a21df46b8..00000000000 --- a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc - -import pytest -import numpy as np -import networkx as nx - -import cudf -import cugraph -from cugraph.testing import utils, UNDIRECTED_DATASETS -from cudf.testing import assert_series_equal - - -print("Networkx version : {} ".format(nx.__version__)) - - -# ============================================================================= -# Pytest Setup / Teardown - called for each test function -# ============================================================================= -def setup_function(): - gc.collect() - - -def cugraph_call(benchmark_callable, graph_file): - # Device data - cu_M = graph_file.get_edgelist() - weight_arr = cudf.Series( - np.ones(max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) - ) - weights = cudf.DataFrame() - weights["vertex"] = np.arange(len(weight_arr), dtype=np.int32) - weights["weight"] = weight_arr - - G = graph_file.get_graph(ignore_weights=True) - - # cugraph Jaccard Call - df = benchmark_callable(cugraph.jaccard_w, G, weights) - - df = df.sort_values(["first", "second"]).reset_index(drop=True) - - return df["jaccard_coeff"] - - -def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] - edges = [] - for i in range(len(sources)): - edges.append((sources[i], destinations[i])) - edges.append((destinations[i], sources[i])) - edges = list(dict.fromkeys(edges)) - edges = sorted(edges) - # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this - # explicitly - print("Format conversion ... ") - - # NetworkX graph - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - # Networkx Jaccard Call - print("Solving... 
") - if benchmark_callable is not None: - preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges) - else: - preds = nx.jaccard_coefficient(Gnx, edges) - - coeff = [] - for u, v, p in preds: - coeff.append(p) - return coeff - - -# ============================================================================= -# Pytest Fixtures -# ============================================================================= -@pytest.fixture(scope="module", params=UNDIRECTED_DATASETS) -def read_csv(request): - """ - Read csv file for both networkx and cugraph - """ - graph_file = request.param - dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) - - return M, graph_file - - -@pytest.mark.sg -def test_wjaccard(gpubenchmark, read_csv): - - M, graph_file = read_csv - - cu_coeff = cugraph_call(gpubenchmark, graph_file) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_nx_wjaccard_time(gpubenchmark, read_csv): - - M, _ = read_csv - networkx_call(M, gpubenchmark) - - -@pytest.mark.sg -def test_wjaccard_multi_column_weights(gpubenchmark, read_csv): - - M, graph_file = read_csv - - cu_coeff = cugraph_call(gpubenchmark, graph_file) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_wjaccard_multi_column(read_csv): - - M, _ = read_csv - - cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - G1 = cugraph.Graph() - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) - - G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] - vertex_pair = vertex_pair[:5] - - weight_arr = 
cudf.Series(np.ones(G2.number_of_vertices(), dtype=np.float32)) - weights = cudf.DataFrame() - weights["vertex"] = G2.nodes() - weights["vertex_"] = weights["vertex"] + 1000 - weights["weight"] = weight_arr - - df_res = cugraph.jaccard_w(G1, weights, vertex_pair) - - weights = weights[["vertex", "weight"]] - df_exp = cugraph.jaccard_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) - - # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) - - -@pytest.mark.sg -def test_invalid_datasets_jaccard_w(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.jaccard_w(G, None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py b/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py deleted file mode 100644 index 1dffb9fca41..00000000000 --- a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc - -import pytest -import scipy -import numpy as np - -import cudf -import cugraph -from cudf.testing import assert_series_equal -from cugraph.testing import utils, UNDIRECTED_DATASETS - - -# ============================================================================= -# Pytest Setup / Teardown - called for each test function -# ============================================================================= -def setup_function(): - gc.collect() - - -def cugraph_call(benchmark_callable, graph_file, pairs): - # Device data - cu_M = graph_file.get_edgelist() - weights_arr = cudf.Series( - np.ones(max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) - ) - weights = cudf.DataFrame() - weights["vertex"] = np.arange(len(weights_arr), dtype=np.int32) - weights["weight"] = weights_arr - - G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) - - # cugraph Overlap Call - df = benchmark_callable(cugraph.overlap_w, G, weights, pairs) - - df = df.sort_values(by=["first", "second"]) - return df["overlap_coeff"].to_numpy() - - -def intersection(a, b, M): - count = 0 - a_idx = M.indptr[a] - b_idx = M.indptr[b] - - while (a_idx < M.indptr[a + 1]) and (b_idx < M.indptr[b + 1]): - a_vertex = M.indices[a_idx] - b_vertex = M.indices[b_idx] - - if a_vertex == b_vertex: - count += 1 - a_idx += 1 - b_idx += 1 - elif a_vertex < b_vertex: - a_idx += 1 - else: - b_idx += 1 - - return count - - -def degree(a, M): - return M.indptr[a + 1] - M.indptr[a] - - -def overlap(a, b, M): - b_sum = degree(b, M) - if b_sum == 0: - return float("NaN") - - i = intersection(a, b, M) - a_sum = degree(a, M) - total = min(a_sum, b_sum) - return i / total - - -def cpu_call(M, first, second): - result = [] - for i in range(len(first)): - result.append(overlap(first[i], second[i], M)) - return result - - -@pytest.mark.sg -@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) -def test_woverlap(gpubenchmark, graph_file): - dataset_path = graph_file.get_path() - Mnx = 
utils.read_csv_for_nx(dataset_path) - N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 - M = scipy.sparse.csr_matrix((Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N)) - - G = graph_file.get_graph(ignore_weights=True) - pairs = ( - G.get_two_hop_neighbors() - .sort_values(["first", "second"]) - .reset_index(drop=True) - ) - - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs) - cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) - assert len(cu_coeff) == len(cpu_coeff) - for i in range(len(cu_coeff)): - if np.isnan(cpu_coeff[i]): - assert np.isnan(cu_coeff[i]) - elif np.isnan(cu_coeff[i]): - assert cpu_coeff[i] == cu_coeff[i] - else: - diff = abs(cpu_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) -def test_woverlap_multi_column(graph_file): - dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) - - cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - G1 = cugraph.Graph() - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) - - G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] - vertex_pair = vertex_pair[:5] - - weight_arr = cudf.Series(np.ones(G2.number_of_vertices(), dtype=np.float32)) - - weights = cudf.DataFrame() - weights["vertex"] = G2.nodes() - weights["vertex_"] = weights["vertex"] + 1000 - weights["weight"] = weight_arr - - df_res = cugraph.overlap_w(G1, weights, vertex_pair) - - weights = weights[["vertex", "weight"]] - df_exp = cugraph.overlap_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) - - # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["overlap_coeff"], 
expected["overlap_coeff"]) - - -@pytest.mark.sg -def test_invalid_datasets_overlap_w(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.overlap_w(G, None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py deleted file mode 100644 index 8d09b3e25b3..00000000000 --- a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc - -import pytest -import numpy as np -import networkx as nx - -import cudf -import cugraph -from cudf.testing import assert_series_equal -from cugraph.testing import utils, UNDIRECTED_DATASETS - - -print("Networkx version : {} ".format(nx.__version__)) - - -# ============================================================================= -# Pytest Setup / Teardown - called for each test function -# ============================================================================= -def setup_function(): - gc.collect() - - -def cugraph_call(benchmark_callable, graph_file): - # Device data - cu_M = graph_file.get_edgelist() - weight_arr = cudf.Series( - np.ones(max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) - ) - weights = cudf.DataFrame() - weights["vertex"] = np.arange(len(weight_arr), dtype=np.int32) - weights["weight"] = weight_arr - - G = graph_file.get_graph(ignore_weights=True) - - # cugraph Sorensen Call - df = benchmark_callable(cugraph.sorensen_w, G, weights) - - df = df.sort_values(["first", "second"]).reset_index(drop=True) - - return df["sorensen_coeff"] - - -def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] - edges = [] - for i in range(len(sources)): - edges.append((sources[i], destinations[i])) - edges.append((destinations[i], sources[i])) - edges = list(dict.fromkeys(edges)) - edges = sorted(edges) - # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this - # explicitly - print("Format conversion ... ") - - # NetworkX graph - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - # Networkx Jaccard Call - print("Solving... 
") - if benchmark_callable is not None: - preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges) - else: - preds = nx.jaccard_coefficient(Gnx, edges) - coeff = [] - for u, v, p in preds: - # FIXME: Use known correct values of WSorensen for few graphs, - # hardcode it and compare to Cugraph WSorensen - # to get a more robust test - - # Conversion from Networkx Jaccard to Sorensen - coeff.append((2 * p) / (1 + p)) - return coeff - - -# ============================================================================= -# Pytest Fixtures -# ============================================================================= -@pytest.fixture(scope="module", params=UNDIRECTED_DATASETS) -def read_csv(request): - """ - Read csv file for both networkx and cugraph - """ - graph_file = request.param - dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) - - return M, graph_file - - -@pytest.mark.sg -def test_wsorensen(gpubenchmark, read_csv): - - M, graph_file = read_csv - - cu_coeff = cugraph_call(gpubenchmark, graph_file) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_nx_wsorensen_time(gpubenchmark, read_csv): - - M, _ = read_csv - networkx_call(M, gpubenchmark) - - -@pytest.mark.sg -def test_wsorensen_multi_column_weights(gpubenchmark, read_csv): - - M, cu_M = read_csv - - cu_coeff = cugraph_call(gpubenchmark, cu_M) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_wsorensen_multi_column(read_csv): - - M, _ = read_csv - - cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - G1 = cugraph.Graph() - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) - - G2 = cugraph.Graph() - 
G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] - vertex_pair = vertex_pair[:5] - - weight_arr = cudf.Series(np.ones(G2.number_of_vertices(), dtype=np.float32)) - weights = cudf.DataFrame() - weights["vertex"] = G2.nodes() - weights["vertex_"] = weights["vertex"] + 1000 - weights["weight"] = weight_arr - - df_res = cugraph.sorensen_w(G1, weights, vertex_pair) - - weights = weights[["vertex", "weight"]] - df_exp = cugraph.sorensen_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) - - # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"]) - - -@pytest.mark.sg -def test_invalid_datasets_sorensen_w(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.sorensen_w(G, None) diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index 711652bbae6..45f6de2f663 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -87,6 +87,13 @@ from pylibcugraph.generate_rmat_edgelists import generate_rmat_edgelists +from pylibcugraph.jaccard_coefficients import jaccard_coefficients + +from pylibcugraph.overlap_coefficients import overlap_coefficients + +from pylibcugraph.sorensen_coefficients import sorensen_coefficients + + from pylibcugraph import exceptions __version__ = "23.10.00" diff --git a/python/pylibcugraph/pylibcugraph/experimental/__init__.py b/python/pylibcugraph/pylibcugraph/experimental/__init__.py index 1b93f9322af..6194ace5956 100644 --- a/python/pylibcugraph/pylibcugraph/experimental/__init__.py +++ b/python/pylibcugraph/pylibcugraph/experimental/__init__.py @@ -1,4 +1,4 @@ 
-# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -74,18 +74,17 @@ from pylibcugraph.node2vec import node2vec -node2vec = promoted_experimental_warning_wrapper(node2vec) -from pylibcugraph.jaccard_coefficients import EXPERIMENTAL__jaccard_coefficients +# from pylibcugraph.jaccard_coefficients import EXPERIMENTAL__jaccard_coefficients -jaccard_coefficients = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficients) +# jaccard_coefficients = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficients) -from pylibcugraph.overlap_coefficients import EXPERIMENTAL__overlap_coefficients +# from pylibcugraph.overlap_coefficients import EXPERIMENTAL__overlap_coefficients -overlap_coefficients = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficients) +# overlap_coefficients = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficients) -from pylibcugraph.sorensen_coefficients import EXPERIMENTAL__sorensen_coefficients +# from pylibcugraph.sorensen_coefficients import EXPERIMENTAL__sorensen_coefficients -sorensen_coefficients = experimental_warning_wrapper( - EXPERIMENTAL__sorensen_coefficients -) +# sorensen_coefficients = experimental_warning_wrapper( +# EXPERIMENTAL__sorensen_coefficients +# ) diff --git a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx index 805ee821eab..59e94aeb615 100644 --- a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,6 +15,8 @@ # cython: language_level = 3 from libc.stdint cimport uintptr_t +from libc.stdio cimport printf +from cython.operator cimport dereference from pylibcugraph._cugraph_c.resource_handle cimport ( bool_t, @@ -57,7 +59,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__jaccard_coefficients(ResourceHandle resource_handle, +def jaccard_coefficients(ResourceHandle resource_handle, _GPUGraph graph, first, second, @@ -83,8 +85,10 @@ def EXPERIMENTAL__jaccard_coefficients(ResourceHandle resource_handle, second : Destination of the vertex pair. - use_weight : bool, optional (default=False) - Currently not supported + use_weight : bool, optional + If set to True, the compute weighted jaccard_coefficients( + the input graph must be weighted in that case). + Otherwise, computed un-weighted jaccard_coefficients do_expensive_check : bool If True, performs more extensive tests on the inputs to ensure diff --git a/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx index 6af71116469..28360121c64 100644 --- a/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -57,7 +57,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__overlap_coefficients(ResourceHandle resource_handle, +def overlap_coefficients(ResourceHandle resource_handle, _GPUGraph graph, first, second, @@ -84,8 +84,10 @@ def EXPERIMENTAL__overlap_coefficients(ResourceHandle resource_handle, second : Destination of the vertex pair. 
- use_weight : bool, optional (default=False) - Currently not supported + use_weight : bool, optional + If set to True, the compute weighted jaccard_coefficients( + the input graph must be weighted in that case). + Otherwise, computed un-weighted jaccard_coefficients do_expensive_check : bool If True, performs more extensive tests on the inputs to ensure diff --git a/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx index 12647baccb2..983a635012f 100644 --- a/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -57,7 +57,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__sorensen_coefficients(ResourceHandle resource_handle, +def sorensen_coefficients(ResourceHandle resource_handle, _GPUGraph graph, first, second, @@ -83,8 +83,10 @@ def EXPERIMENTAL__sorensen_coefficients(ResourceHandle resource_handle, second : Destination of the vertex pair. - use_weight : bool, optional (default=False) - Currently not supported + use_weight : bool, optional + If set to True, the compute weighted jaccard_coefficients( + the input graph must be weighted in that case). 
Otherwise, compute un-weighted sorensen_coefficients.
a7047e3f0049597b4da625138107830ff78405e5 Mon Sep 17 00:00:00 2001 From: Don Acosta <97529984+acostadon@users.noreply.github.com> Date: Thu, 21 Sep 2023 17:19:30 -0400 Subject: [PATCH 12/22] adding dining preference dataset (#3866) This dataset is very small, and uses strings as node names. It will be used to test force atlas, and in a new link prediction/similarity notebook. the licensing is contained here. http://networkdata.ics.uci.edu/netdata/html/Dining-table_partners.html Authors: - Don Acosta (https://github.com/acostadon) - Brad Rees (https://github.com/BradReesWork) Approvers: - Brad Rees (https://github.com/BradReesWork) - ralph (https://github.com/nv-rliu) URL: https://github.com/rapidsai/cugraph/pull/3866 --- python/cugraph/cugraph/datasets/__init__.py | 1 + .../datasets/metadata/dining_prefs.yaml | 23 +++++++++++++++++++ python/cugraph/cugraph/testing/__init__.py | 3 +++ 3 files changed, 27 insertions(+) create mode 100644 python/cugraph/cugraph/datasets/metadata/dining_prefs.yaml diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py index 7ba274c5960..65a820f108b 100644 --- a/python/cugraph/cugraph/datasets/__init__.py +++ b/python/cugraph/cugraph/datasets/__init__.py @@ -27,6 +27,7 @@ meta_path = Path(__file__).parent / "metadata" cyber = Dataset(meta_path / "cyber.yaml") +dining_prefs = Dataset(meta_path / "dining_prefs.yaml") dolphins = Dataset(meta_path / "dolphins.yaml") email_Eu_core = Dataset(meta_path / "email_Eu_core.yaml") karate = Dataset(meta_path / "karate.yaml") diff --git a/python/cugraph/cugraph/datasets/metadata/dining_prefs.yaml b/python/cugraph/cugraph/datasets/metadata/dining_prefs.yaml new file mode 100644 index 00000000000..e7ec85d7a1f --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/dining_prefs.yaml @@ -0,0 +1,23 @@ +name: dining_prefs +file_type: .csv +description: Classic social networking dataset describes dining preferences for a dormitory in New York state. 
+author: J.L. Moreno +refs: + J. L. Moreno (1960). The Sociometry Reader. The Free Press, Glencoe, Illinois, pg.35 +delim: " " +header: None +col_names: + - src + - dst + - wgt +col_types: + - string + - string + - int +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 42 +number_of_nodes: 26 +url: https://data.rapids.ai/cugraph/datasets/dining_prefs.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py index bde398aadbd..f5f0bcb06eb 100644 --- a/python/cugraph/cugraph/testing/__init__.py +++ b/python/cugraph/cugraph/testing/__init__.py @@ -23,6 +23,7 @@ ) from cugraph.datasets import ( cyber, + dining_prefs, dolphins, karate, karate_disjoint, @@ -42,6 +43,7 @@ UNDIRECTED_DATASETS = [karate, dolphins] SMALL_DATASETS = [karate, dolphins, polbooks] WEIGHTED_DATASETS = [ + dining_prefs, dolphins, karate, karate_disjoint, @@ -51,6 +53,7 @@ small_tree, ] ALL_DATASETS = [ + dining_prefs, dolphins, karate, karate_disjoint, From 367f36cfd4719fb522f12dbb74cec5b8a1e61aa6 Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Fri, 22 Sep 2023 10:55:52 -0400 Subject: [PATCH 13/22] Add file to update-version.sh [skip ci] (#3870) Add a new file to `update-version.sh`. 
Tested locally Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Jake Awe (https://github.com/AyodeAwe) --- ci/release/update-version.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 2c8735079f0..bd3aa6bc370 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -126,3 +126,5 @@ for FILE in .github/workflows/*.yaml; do sed_runner "s/dask-cuda.git@branch-[0-9][0-9].[0-9][0-9]/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh + +sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" python/nx-cugraph/README.md From f53bb56dc3245f64523aeeb997430c8f49de4624 Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Fri, 22 Sep 2023 12:44:31 -0400 Subject: [PATCH 14/22] Fix torch seed in `cugraph-dgl` and `-pyg` tests for conv layers (#3869) Fixes https://github.com/rapidsai/graph_dl/issues/325 Recently, a few CI runs (ex. [1](https://github.com/rapidsai/cugraph/actions/runs/6254253684/job/16983164330?pr=3828#step:7:5078), [2](https://github.com/rapidsai/cugraph/actions/runs/6224345348/job/16896416094?pr=3843)) failed when comparing results from cugraph-ops-based conv layers against results from upstream frameworks. The tests pass most of the time, but occasionally fail due to a combination of using a strict tolerance and bad numerics (floating point error). This PR fixes the seed used for generating random feature tensors so that CI behaves consistently across different runs. 
Authors: - Tingyu Wang (https://github.com/tingyu66) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3869 --- python/cugraph-dgl/tests/nn/test_gatconv.py | 2 ++ python/cugraph-dgl/tests/nn/test_gatv2conv.py | 2 ++ python/cugraph-dgl/tests/nn/test_relgraphconv.py | 15 +++++++++++---- python/cugraph-dgl/tests/nn/test_sageconv.py | 1 + .../cugraph-dgl/tests/nn/test_transformerconv.py | 1 + .../cugraph_pyg/tests/nn/test_gat_conv.py | 1 + .../cugraph_pyg/tests/nn/test_gatv2_conv.py | 1 + .../cugraph_pyg/tests/nn/test_rgcn_conv.py | 1 + .../cugraph_pyg/tests/nn/test_sage_conv.py | 1 + .../cugraph_pyg/tests/nn/test_transformer_conv.py | 1 + 10 files changed, 22 insertions(+), 4 deletions(-) diff --git a/python/cugraph-dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/tests/nn/test_gatconv.py index ef3047dc2cd..ce145b2bc87 100644 --- a/python/cugraph-dgl/tests/nn/test_gatconv.py +++ b/python/cugraph-dgl/tests/nn/test_gatconv.py @@ -35,6 +35,7 @@ def test_gatconv_equality( ): from dgl.nn.pytorch import GATConv + torch.manual_seed(12345) g = create_graph1().to("cuda") if idtype_int: @@ -121,6 +122,7 @@ def test_gatconv_equality( def test_gatconv_edge_feats( bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats ): + torch.manual_seed(12345) g = create_graph1().to("cuda") if to_block: diff --git a/python/cugraph-dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/tests/nn/test_gatv2conv.py index cc46a6e4b39..52003edacca 100644 --- a/python/cugraph-dgl/tests/nn/test_gatv2conv.py +++ b/python/cugraph-dgl/tests/nn/test_gatv2conv.py @@ -35,6 +35,7 @@ def test_gatv2conv_equality( ): from dgl.nn.pytorch import GATv2Conv + torch.manual_seed(12345) g = create_graph1().to("cuda") if idtype_int: @@ -109,6 +110,7 @@ def test_gatv2conv_equality( def test_gatv2conv_edge_feats( bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats ): + torch.manual_seed(12345) g = create_graph1().to("cuda") 
if to_block: diff --git a/python/cugraph-dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/tests/nn/test_relgraphconv.py index 901f9ba1433..bdaa89e57f2 100644 --- a/python/cugraph-dgl/tests/nn/test_relgraphconv.py +++ b/python/cugraph-dgl/tests/nn/test_relgraphconv.py @@ -41,6 +41,7 @@ def test_relgraphconv_equality( ): from dgl.nn.pytorch import RelGraphConv + torch.manual_seed(12345) in_feat, out_feat, num_rels = 10, 2, 3 args = (in_feat, out_feat, num_rels) kwargs = { @@ -75,12 +76,18 @@ def test_relgraphconv_equality( size=size, src_ids=indices, cdst_ids=offsets, values=etypes, formats="csc" ) - torch.manual_seed(0) conv1 = RelGraphConv(*args, **kwargs).cuda() + conv2 = CuGraphRelGraphConv(*args, **kwargs, apply_norm=False).cuda() - torch.manual_seed(0) - kwargs["apply_norm"] = False - conv2 = CuGraphRelGraphConv(*args, **kwargs).cuda() + with torch.no_grad(): + if self_loop: + conv2.W.data[:-1] = conv1.linear_r.W.data + conv2.W.data[-1] = conv1.loop_weight.data + else: + conv2.W.data = conv1.linear_r.W.data.detach().clone() + + if regularizer is not None: + conv2.coeff.data = conv1.linear_r.coeff.data.detach().clone() out1 = conv1(g, feat, g.edata[dgl.ETYPE]) diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/tests/nn/test_sageconv.py index e2acf9e6596..b5d0a44b868 100644 --- a/python/cugraph-dgl/tests/nn/test_sageconv.py +++ b/python/cugraph-dgl/tests/nn/test_sageconv.py @@ -35,6 +35,7 @@ def test_sageconv_equality( ): from dgl.nn.pytorch import SAGEConv + torch.manual_seed(12345) kwargs = {"aggregator_type": aggr, "bias": bias} g = create_graph1().to("cuda") diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/tests/nn/test_transformerconv.py index b2b69cb35ab..5ac4fd7bea7 100644 --- a/python/cugraph-dgl/tests/nn/test_transformerconv.py +++ b/python/cugraph-dgl/tests/nn/test_transformerconv.py @@ -41,6 +41,7 @@ def test_transformerconv( use_edge_feats, sparse_format, ): + 
torch.manual_seed(12345) device = "cuda" g = create_graph1().to(device) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py index 21c43bad38c..62bebb9211d 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py @@ -32,6 +32,7 @@ def test_gat_conv_equality( import torch from torch_geometric.nn import GATConv + torch.manual_seed(12345) edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py index 6b11e87154a..a4794628410 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py @@ -28,6 +28,7 @@ def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr, graph, req import torch from torch_geometric.nn import GATv2Conv + torch.manual_seed(12345) edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py index 233c6aa2836..ded4f300c0c 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py @@ -31,6 +31,7 @@ def test_rgcn_conv_equality( import torch from torch_geometric.nn import FastRGCNConv as RGCNConv + torch.manual_seed(12345) in_channels, out_channels, num_relations = (4, 2, 3) kwargs = dict(aggr=aggr, bias=bias, num_bases=num_bases, root_weight=root_weight) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py index 7f73cddbdbb..b2977d1d175 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py +++ 
b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py @@ -32,6 +32,7 @@ def test_sage_conv_equality( import torch from torch_geometric.nn import SAGEConv + torch.manual_seed(12345) edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() csc = CuGraphSAGEConv.to_csc(edge_index, size) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py index 7dba1a6d515..fbdb244898b 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py @@ -27,6 +27,7 @@ def test_transformer_conv_equality(bipartite, concat, heads, graph, request): import torch from torch_geometric.nn import TransformerConv + torch.manual_seed(12345) edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() csc = CuGraphTransformerConv.to_csc(edge_index, size) From fe17abc6da469d810ea512d1d887407032613405 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Mon, 25 Sep 2023 10:48:40 -0400 Subject: [PATCH 15/22] cuGraph-PyG Loader Improvements (#3795) Consolidates various speed improvements tested while running performance benchmarks. Avoids copying batch data, removes redundant data loading code, simplifies and improves de-offsetting, even though that is now being bypassed entirely for homogeneous graphs. Removes extra host to device copy. Properly flips the src/dst columns in the returned `HeteroData` minibatch objects, avoid exposing this to the end user. I've confirmed this cuts the MFG time by a factor of 4. 
Closes #3807 Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Vibhu Jawa (https://github.com/VibhuJawa) - Don Acosta (https://github.com/acostadon) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3795 --- .../cugraph_pyg/data/cugraph_store.py | 218 ++++++++++---- .../cugraph_pyg/loader/cugraph_node_loader.py | 107 +++++-- .../cugraph-pyg/cugraph_pyg/loader/filter.py | 57 ---- .../cugraph_pyg/sampler/cugraph_sampler.py | 281 +++++++++++------- .../tests/mg/test_mg_cugraph_loader.py | 4 +- .../tests/mg/test_mg_cugraph_sampler.py | 28 +- .../tests/mg/test_mg_cugraph_store.py | 6 +- .../cugraph_pyg/tests/test_cugraph_loader.py | 158 ++++++---- .../cugraph_pyg/tests/test_cugraph_sampler.py | 28 +- .../cugraph_pyg/tests/test_cugraph_store.py | 2 +- 10 files changed, 548 insertions(+), 341 deletions(-) delete mode 100644 python/cugraph-pyg/cugraph_pyg/loader/filter.py diff --git a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py index 8d5d2fd4894..e0d318adbe0 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py @@ -25,6 +25,7 @@ import pandas import cudf import cugraph +import warnings from cugraph.utilities.utils import import_optional, MissingModule @@ -211,7 +212,9 @@ def __init__( F: cugraph.gnn.FeatureStore, G: Union[Dict[str, Tuple[TensorType]], Dict[str, int]], num_nodes_dict: Dict[str, int], + *, multi_gpu: bool = False, + order: str = "CSC", ): """ Constructs a new CuGraphStore from the provided @@ -256,11 +259,20 @@ def __init__( multi_gpu: bool (Optional, default = False) Whether the store should be backed by a multi-GPU graph. Requires dask to have been set up. + + order: str (Optional ["CSR", "CSC"], default = CSC) + The order to use for sampling. Should nearly always be CSC + unless there is a specific expectation of "reverse" sampling. 
+ It is also not uncommon to use CSR order for correctness + testing, which some cuGraph-PyG tests do. """ if None in G: raise ValueError("Unspecified edge types not allowed in PyG") + if order != "CSR" and order != "CSC": + raise ValueError("invalid valid for order") + self.__vertex_dtype = torch.int64 self._tensor_attr_cls = CuGraphTensorAttr @@ -289,6 +301,7 @@ def __init__( self.__features = F self.__graph = None self.__is_graph_owner = False + self.__order = order if construct_graph: if multi_gpu: @@ -297,7 +310,9 @@ def __init__( ) if self.__graph is None: - self.__graph = self.__construct_graph(G, multi_gpu=multi_gpu) + self.__graph = self.__construct_graph( + G, multi_gpu=multi_gpu, order=order + ) self.__is_graph_owner = True self.__subgraphs = {} @@ -347,6 +362,7 @@ def __construct_graph( self, edge_info: Dict[Tuple[str, str, str], List[TensorType]], multi_gpu: bool = False, + order: str = "CSC", ) -> cugraph.MultiGraph: """ This function takes edge information and uses it to construct @@ -363,6 +379,14 @@ def __construct_graph( multi_gpu: bool (Optional, default=False) Whether to construct a single-GPU or multi-GPU cugraph Graph. Defaults to a single-GPU graph. + + order: str (CSC or CSR) + Essentially whether to reverse edges so that the cuGraph + sampling algorithm operates on the CSC matrix instead of + the CSR matrix. Should nearly always be CSC unless there + is a specific expectation of reverse sampling, or correctness + testing is being performed. + Returns ------- A newly-constructed directed cugraph.MultiGraph object. @@ -371,6 +395,9 @@ def __construct_graph( # Ensure the original dict is not modified. 
edge_info_cg = {} + if order != "CSR" and order != "CSC": + raise ValueError("Order must be either CSC (default) or CSR!") + # Iterate over the keys in sorted order so that the created # numerical types correspond to the lexicographic order # of the keys, which is critical to converting the numeric @@ -430,20 +457,43 @@ def __construct_graph( df = pandas.DataFrame( { - "src": pandas.Series(na_src), - "dst": pandas.Series(na_dst), + "src": pandas.Series(na_dst) + if order == "CSC" + else pandas.Series(na_src), + "dst": pandas.Series(na_src) + if order == "CSC" + else pandas.Series(na_dst), "etp": pandas.Series(na_etp), } ) + vertex_dtype = df.src.dtype if multi_gpu: nworkers = len(distributed.get_client().scheduler_info()["workers"]) - df = dd.from_pandas(df, npartitions=nworkers).persist() - df = df.map_partitions(cudf.DataFrame.from_pandas) - else: - df = cudf.from_pandas(df) + df = dd.from_pandas(df, npartitions=nworkers if len(df) > 32 else 1) + + # Ensure the dataframe is constructed on each partition + # instead of adding additional synchronization head from potential + # host to device copies. 
+ def get_empty_df(): + return cudf.DataFrame( + { + "src": cudf.Series([], dtype=vertex_dtype), + "dst": cudf.Series([], dtype=vertex_dtype), + "etp": cudf.Series([], dtype="int32"), + } + ) - df = df.reset_index(drop=True) + # Have to check for empty partitions and handle them appropriately + df = df.persist() + df = df.map_partitions( + lambda f: cudf.DataFrame.from_pandas(f) + if len(f) > 0 + else get_empty_df(), + meta=get_empty_df(), + ).reset_index(drop=True) + else: + df = cudf.from_pandas(df).reset_index(drop=True) graph = cugraph.MultiGraph(directed=True) if multi_gpu: @@ -468,6 +518,10 @@ def __construct_graph( def _edge_types_to_attrs(self) -> dict: return dict(self.__edge_types_to_attrs) + @property + def order(self) -> str: + return self.__order + @property def node_types(self) -> List[NodeType]: return list(self.__vertex_type_offsets["type"]) @@ -557,6 +611,7 @@ def _get_edge_index(self, attr: CuGraphEdgeAttr) -> Tuple[TensorType, TensorType raise ValueError("Graph is not in memory, cannot access edge index!") if attr.layout != EdgeLayout.COO: + # TODO support returning CSR/CSC (Issue #3802) raise TypeError("Only COO direct access is supported!") # Currently, graph creation enforces that input vertex ids are always of @@ -566,12 +621,14 @@ def _get_edge_index(self, attr: CuGraphEdgeAttr) -> Tuple[TensorType, TensorType # This may change in the future if/when renumbering or the graph # creation process is refactored. # See Issue #3201 for more details. + # Also note src/dst are flipped so that cuGraph sampling is done in + # CSC format rather than CSR format. 
if self._is_delayed: - src_col_name = self.__graph.renumber_map.renumbered_src_col_name - dst_col_name = self.__graph.renumber_map.renumbered_dst_col_name + dst_col_name = self.__graph.renumber_map.renumbered_src_col_name + src_col_name = self.__graph.renumber_map.renumbered_dst_col_name else: - src_col_name = self.__graph.srcCol - dst_col_name = self.__graph.dstCol + dst_col_name = self.__graph.srcCol + src_col_name = self.__graph.dstCol # If there is only one edge type (homogeneous graph) then # bypass the edge filters for a significant speed improvement. @@ -785,29 +842,73 @@ def _get_renumbered_edge_groups_from_sample( """ row_dict = {} col_dict = {} - if len(self.__edge_types_to_attrs) == 1: + # If there is only 1 edge type (includes heterogeneous graphs) + if len(self.edge_types) == 1: t_pyg_type = list(self.__edge_types_to_attrs.values())[0].edge_type src_type, _, dst_type = t_pyg_type - dst_id_table = noi_index[dst_type] - dst_id_map = ( - cudf.Series(cupy.asarray(dst_id_table), name="dst") - .reset_index() - .rename(columns={"index": "new_id"}) - .set_index("dst") - ) - dst = dst_id_map["new_id"].loc[sampling_results.destinations] - col_dict[t_pyg_type] = torch.as_tensor(dst.values, device="cuda") - - src_id_table = noi_index[src_type] - src_id_map = ( - cudf.Series(cupy.asarray(src_id_table), name="src") - .reset_index() - .rename(columns={"index": "new_id"}) - .set_index("src") - ) - src = src_id_map["new_id"].loc[sampling_results.sources] - row_dict[t_pyg_type] = torch.as_tensor(src.values, device="cuda") + # If there is only 1 node type (homogeneous) + # This should only occur if the cuGraph loader was + # not used. This logic is deprecated. + if len(self.node_types) == 1: + warnings.warn( + "Renumbering after sampling for homogeneous graphs is deprecated.", + FutureWarning, + ) + + # Create a dataframe mapping old ids to new ids. 
+ vtype = src_type + id_table = noi_index[vtype] + id_map = cudf.Series( + cupy.arange(id_table.shape[0], dtype="int32"), + name="new_id", + index=cupy.asarray(id_table), + ).sort_index() + + # Renumber the sources using binary search + # Step 1: get the index of the new id + ix_r = torch.searchsorted( + torch.as_tensor(id_map.index.values, device="cuda"), + torch.as_tensor(sampling_results.sources.values, device="cuda"), + ) + # Step 2: Go from id indices to actual ids + row_dict[t_pyg_type] = torch.as_tensor(id_map.values, device="cuda")[ + ix_r + ] + + # Renumber the destinations using binary search + # Step 1: get the index of the new id + ix_c = torch.searchsorted( + torch.as_tensor(id_map.index.values, device="cuda"), + torch.as_tensor( + sampling_results.destinations.values, device="cuda" + ), + ) + # Step 2: Go from id indices to actual ids + col_dict[t_pyg_type] = torch.as_tensor(id_map.values, device="cuda")[ + ix_c + ] + else: + # Handle the heterogeneous case where there is only 1 edge type + dst_id_table = noi_index[dst_type] + dst_id_map = cudf.DataFrame( + { + "dst": cupy.asarray(dst_id_table), + "new_id": cupy.arange(dst_id_table.shape[0]), + } + ).set_index("dst") + dst = dst_id_map["new_id"].loc[sampling_results.destinations] + col_dict[t_pyg_type] = torch.as_tensor(dst.values, device="cuda") + + src_id_table = noi_index[src_type] + src_id_map = cudf.DataFrame( + { + "src": cupy.asarray(src_id_table), + "new_id": cupy.arange(src_id_table.shape[0]), + } + ).set_index("src") + src = src_id_map["new_id"].loc[sampling_results.sources] + row_dict[t_pyg_type] = torch.as_tensor(src.values, device="cuda") else: # This will retrieve the single string representation. 
@@ -822,36 +923,18 @@ def _get_renumbered_edge_groups_from_sample( for pyg_can_edge_type_str, ix in eoi_types.items(): pyg_can_edge_type = tuple(pyg_can_edge_type_str.split("__")) - src_type, _, dst_type = pyg_can_edge_type - - # Get the de-offsetted sources - sources = torch.as_tensor( - sampling_results.sources.iloc[ix].values, device="cuda" - ) - sources_ix = torch.searchsorted( - self.__vertex_type_offsets["stop"], sources - ) - sources -= self.__vertex_type_offsets["start"][sources_ix] - # Create the row entry for this type - src_id_table = noi_index[src_type] - src_id_map = ( - cudf.Series(cupy.asarray(src_id_table), name="src") - .reset_index() - .rename(columns={"index": "new_id"}) - .set_index("src") - ) - src = src_id_map["new_id"].loc[cupy.asarray(sources)] - row_dict[pyg_can_edge_type] = torch.as_tensor(src.values, device="cuda") + if self.__order == "CSR": + src_type, _, dst_type = pyg_can_edge_type + else: # CSC + dst_type, _, src_type = pyg_can_edge_type # Get the de-offsetted destinations + dst_num_type = self._numeric_vertex_type_from_name(dst_type) destinations = torch.as_tensor( sampling_results.destinations.iloc[ix].values, device="cuda" ) - destinations_ix = torch.searchsorted( - self.__vertex_type_offsets["stop"], destinations - ) - destinations -= self.__vertex_type_offsets["start"][destinations_ix] + destinations -= self.__vertex_type_offsets["start"][dst_num_type] # Create the col entry for this type dst_id_table = noi_index[dst_type] @@ -864,6 +947,24 @@ def _get_renumbered_edge_groups_from_sample( dst = dst_id_map["new_id"].loc[cupy.asarray(destinations)] col_dict[pyg_can_edge_type] = torch.as_tensor(dst.values, device="cuda") + # Get the de-offsetted sources + src_num_type = self._numeric_vertex_type_from_name(src_type) + sources = torch.as_tensor( + sampling_results.sources.iloc[ix].values, device="cuda" + ) + sources -= self.__vertex_type_offsets["start"][src_num_type] + + # Create the row entry for this type + src_id_table = 
noi_index[src_type] + src_id_map = ( + cudf.Series(cupy.asarray(src_id_table), name="src") + .reset_index() + .rename(columns={"index": "new_id"}) + .set_index("src") + ) + src = src_id_map["new_id"].loc[cupy.asarray(sources)] + row_dict[pyg_can_edge_type] = torch.as_tensor(src.values, device="cuda") + return row_dict, col_dict def put_tensor(self, tensor, attr) -> None: @@ -959,9 +1060,7 @@ def _get_tensor(self, attr: CuGraphTensorAttr) -> TensorType: t = t[-1] if isinstance(t, np.ndarray): - t = torch.as_tensor(t, device="cuda") - else: - t = t.cuda() + t = torch.as_tensor(t, device="cpu") return t @@ -979,7 +1078,6 @@ def _get_tensor(self, attr: CuGraphTensorAttr) -> TensorType: t = torch.concatenate([t, u]) - t = t.cuda() return t def _multi_get_tensor(self, attrs: List[CuGraphTensorAttr]) -> List[TensorType]: diff --git a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py index 8d79685965f..cf7eb330d67 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py @@ -23,12 +23,15 @@ from cugraph.utilities.utils import import_optional, MissingModule from cugraph_pyg.data import CuGraphStore -from cugraph_pyg.loader.filter import _filter_cugraph_store -from cugraph_pyg.sampler.cugraph_sampler import _sampler_output_from_sampling_results +from cugraph_pyg.sampler.cugraph_sampler import ( + _sampler_output_from_sampling_results_heterogeneous, + _sampler_output_from_sampling_results_homogeneous, +) from typing import Union, Tuple, Sequence, List, Dict torch_geometric = import_optional("torch_geometric") +torch = import_optional("torch") InputNodes = ( Sequence if isinstance(torch_geometric, MissingModule) @@ -253,55 +256,97 @@ def __next__(self): raw_sample_data = cudf.read_parquet(parquet_path) if "map" in raw_sample_data.columns: - self.__renumber_map = raw_sample_data["map"] + num_batches = end_inclusive - 
self.__start_inclusive + 1 + + map_end = raw_sample_data["map"].iloc[num_batches] + + map = torch.as_tensor( + raw_sample_data["map"].iloc[0:map_end], device="cuda" + ) raw_sample_data.drop("map", axis=1, inplace=True) + + self.__renumber_map_offsets = map[0 : num_batches + 1] - map[0] + self.__renumber_map = map[num_batches + 1 :] + else: self.__renumber_map = None self.__data = raw_sample_data[list(columns.keys())].astype(columns) self.__data.dropna(inplace=True) + if ( + len(self.__graph_store.edge_types) == 1 + and len(self.__graph_store.node_types) == 1 + ): + group_cols = ["batch_id", "hop_id"] + self.__data_index = self.__data.groupby(group_cols, as_index=True).agg( + {"sources": "max", "destinations": "max"} + ) + self.__data_index.rename( + columns={"sources": "src_max", "destinations": "dst_max"}, + inplace=True, + ) + self.__data_index = self.__data_index.to_dict(orient="index") + # Pull the next set of sampling results out of the dataframe in memory f = self.__data["batch_id"] == self.__next_batch if self.__renumber_map is not None: i = self.__next_batch - self.__start_inclusive - ix = self.__renumber_map.iloc[[i, i + 1]] - ix_start, ix_end = ix.iloc[0], ix.iloc[1] - current_renumber_map = self.__renumber_map.iloc[ix_start:ix_end] - if len(current_renumber_map) != ix_end - ix_start: - raise ValueError("invalid renumber map") - else: - current_renumber_map = None - sampler_output = _sampler_output_from_sampling_results( - self.__data[f], current_renumber_map, self.__graph_store - ) + # this should avoid d2h copy + current_renumber_map = self.__renumber_map[ + self.__renumber_map_offsets[i] : self.__renumber_map_offsets[i + 1] + ] - # Get ready for next iteration - self.__next_batch += 1 + else: + current_renumber_map = None # Get and return the sampled subgraph - if isinstance(torch_geometric, MissingModule): - noi_index, row_dict, col_dict, edge_dict = sampler_output["out"] - return _filter_cugraph_store( - self.__feature_store, + if ( + 
len(self.__graph_store.edge_types) == 1 + and len(self.__graph_store.node_types) == 1 + ): + sampler_output = _sampler_output_from_sampling_results_homogeneous( + self.__data[f], + current_renumber_map, self.__graph_store, - noi_index, - row_dict, - col_dict, - edge_dict, + self.__data_index, + self.__next_batch, ) else: - out = torch_geometric.loader.utils.filter_custom_store( - self.__feature_store, - self.__graph_store, - sampler_output.node, - sampler_output.row, - sampler_output.col, - sampler_output.edge, + sampler_output = _sampler_output_from_sampling_results_heterogeneous( + self.__data[f], current_renumber_map, self.__graph_store ) - return out + # Get ready for next iteration + self.__next_batch += 1 + + # Create a PyG HeteroData object, loading the required features + out = torch_geometric.loader.utils.filter_custom_store( + self.__feature_store, + self.__graph_store, + sampler_output.node, + sampler_output.row, + sampler_output.col, + sampler_output.edge, + ) + + # Account for CSR format in cuGraph vs. CSC format in PyG + if self.__graph_store.order == "CSC": + for node_type in out.edge_index_dict: + out[node_type].edge_index[0], out[node_type].edge_index[1] = ( + out[node_type].edge_index[1], + out[node_type].edge_index[0], + ) + + out.set_value_dict("num_sampled_nodes", sampler_output.num_sampled_nodes) + out.set_value_dict("num_sampled_edges", sampler_output.num_sampled_edges) + + return out + + @property + def _starting_batch_id(self): + return self.__starting_batch_id def __iter__(self): return self diff --git a/python/cugraph-pyg/cugraph_pyg/loader/filter.py b/python/cugraph-pyg/cugraph_pyg/loader/filter.py deleted file mode 100644 index f519ba7cfc9..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/loader/filter.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cupy - -from cugraph_pyg.data import CuGraphStore - -from typing import ( - Dict, - Sequence, -) - - -def _filter_cugraph_store( - feature_store: CuGraphStore, - graph_store: CuGraphStore, - node_dict: Dict[str, Sequence], - row_dict: Dict[str, Sequence], - col_dict: Dict[str, Sequence], - edge_dict: Dict[str, Sequence], -) -> dict: - """ - Primarily for testing without torch and torch_geometric. - Returns a dictionary containing the sampled subgraph. - """ - data = {} - - for attr in graph_store.get_all_edge_attrs(): - key = attr.edge_type - if key in row_dict and key in col_dict: - edge_index = cupy.stack([row_dict[key], col_dict[key]]) - data[attr.edge_type] = {} - data[attr.edge_type]["edge_index"] = edge_index - - # Filter node storage: - required_attrs = [] - for attr in feature_store.get_all_tensor_attrs(): - if attr.group_name in node_dict: - attr.index = node_dict[attr.group_name] - required_attrs.append(attr) - data[attr.group_name] = {} - data["num_nodes"] = attr.index.size - tensors = feature_store.multi_get_tensor(required_attrs) - for i, attr in enumerate(required_attrs): - data[attr.group_name][attr.attr_name] = tensors[i] - - return data diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py index d4f600006be..6e8c4322418 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py @@ -12,26 +12,21 @@ # limitations under the License. 
-from typing import Sequence +from typing import Sequence, Dict, Tuple from cugraph_pyg.data import CuGraphStore -from cugraph.utilities.utils import import_optional, MissingModule +from cugraph.utilities.utils import import_optional import cudf dask_cudf = import_optional("dask_cudf") torch_geometric = import_optional("torch_geometric") torch = import_optional("torch") +HeteroSamplerOutput = torch_geometric.sampler.base.HeteroSamplerOutput -HeteroSamplerOutput = ( - None - if isinstance(torch_geometric, MissingModule) - else torch_geometric.sampler.base.HeteroSamplerOutput -) - -def _count_unique_nodes( +def _get_unique_nodes( sampling_results: cudf.DataFrame, graph_store: CuGraphStore, node_type: str, @@ -54,8 +49,8 @@ def _count_unique_nodes( Returns ------- - int - The number of unique nodes of the given node type. + cudf.Series + The unique nodes of the given node type. """ if node_position == "src": edge_index = "sources" @@ -78,12 +73,111 @@ def _count_unique_nodes( sampling_results_node = sampling_results[f] else: - return 0 + return cudf.Series([], dtype="int64") - return sampling_results_node[edge_index].nunique() + return sampling_results_node[edge_index] -def _sampler_output_from_sampling_results( +def _sampler_output_from_sampling_results_homogeneous( + sampling_results: cudf.DataFrame, + renumber_map: torch.Tensor, + graph_store: CuGraphStore, + data_index: Dict[Tuple[int, int], Dict[str, int]], + batch_id: int, + metadata: Sequence = None, +) -> HeteroSamplerOutput: + """ + Parameters + ---------- + sampling_results: cudf.DataFrame + The dataframe containing sampling results. + renumber_map: torch.Tensor + The tensor containing the renumber map, or None if there + is no renumber map. + graph_store: CuGraphStore + The graph store containing the structure of the sampled graph. 
+ data_index: Dict[Tuple[int, int], Dict[str, int]] + Dictionary where keys are the batch id and hop id, + and values are dictionaries containing the max src + and max dst node ids for the batch and hop. + batch_id: int + The current batch id, whose samples are being retrieved + from the sampling results and data index. + metadata: Tensor + The metadata for the sampled batch. + + Returns + ------- + HeteroSamplerOutput + """ + + if len(graph_store.edge_types) > 1 or len(graph_store.node_types) > 1: + raise ValueError("Graph is heterogeneous") + + hops = torch.arange( + sampling_results.hop_id.iloc[len(sampling_results) - 1] + 1, device="cuda" + ) + hops = torch.searchsorted( + torch.as_tensor(sampling_results.hop_id, device="cuda"), hops + ) + + node_type = graph_store.node_types[0] + edge_type = graph_store.edge_types[0] + + num_nodes_per_hop_dict = {node_type: torch.zeros(len(hops) + 1, dtype=torch.int64)} + num_edges_per_hop_dict = {edge_type: torch.zeros(len(hops), dtype=torch.int64)} + + if renumber_map is None: + raise ValueError("Renumbered input is expected for homogeneous graphs") + + noi_index = {node_type: torch.as_tensor(renumber_map, device="cuda")} + + row_dict = { + edge_type: torch.as_tensor(sampling_results.sources, device="cuda"), + } + + col_dict = { + edge_type: torch.as_tensor(sampling_results.destinations, device="cuda"), + } + + num_nodes_per_hop_dict[node_type][0] = data_index[batch_id, 0]["src_max"] + 1 + for hop in range(len(hops)): + hop_ix_start = hops[hop] + hop_ix_end = hops[hop + 1] if hop < len(hops) - 1 else len(sampling_results) + + if num_nodes_per_hop_dict[node_type][hop] > 0: + max_id_hop = data_index[batch_id, hop]["dst_max"] + max_id_prev_hop = ( + data_index[batch_id, hop - 1]["dst_max"] + if hop > 0 + else data_index[batch_id, 0]["src_max"] + ) + + if max_id_hop > max_id_prev_hop: + num_nodes_per_hop_dict[node_type][hop + 1] = ( + max_id_hop - max_id_prev_hop + ) + else: + num_nodes_per_hop_dict[node_type][hop + 1] = 0 + # 
will default to 0 if the previous hop was 0, since this is a PyG requirement + + num_edges_per_hop_dict[edge_type][hop] = hop_ix_end - hop_ix_start + + if HeteroSamplerOutput is None: + raise ImportError("Error importing from pyg") + + return HeteroSamplerOutput( + node=noi_index, + row=row_dict, + col=col_dict, + edge=None, + num_sampled_nodes=num_nodes_per_hop_dict, + num_sampled_edges=num_edges_per_hop_dict, + metadata=metadata, + ) + + +def _sampler_output_from_sampling_results_heterogeneous( sampling_results: cudf.DataFrame, renumber_map: cudf.Series, graph_store: CuGraphStore, @@ -109,7 +203,7 @@ def _sampler_output_from_sampling_results( hops = torch.arange(sampling_results.hop_id.max() + 1, device="cuda") hops = torch.searchsorted( - torch.as_tensor(sampling_results.hop_id.values, device="cuda"), hops + torch.as_tensor(sampling_results.hop_id, device="cuda"), hops ) num_nodes_per_hop_dict = {} @@ -119,13 +213,11 @@ def _sampler_output_from_sampling_results( sampling_results_hop_0 = sampling_results.iloc[ 0 : (hops[1] if len(hops) > 1 else len(sampling_results)) ] + for node_type in graph_store.node_types: - if len(graph_store.node_types) == 1: - num_unique_nodes = sampling_results_hop_0.sources.nunique() - else: - num_unique_nodes = _count_unique_nodes( - sampling_results_hop_0, graph_store, node_type, "src" - ) + num_unique_nodes = _get_unique_nodes( + sampling_results_hop_0, graph_store, node_type, "src" + ).nunique() if num_unique_nodes > 0: num_nodes_per_hop_dict[node_type] = torch.zeros( @@ -134,112 +226,87 @@ def _sampler_output_from_sampling_results( num_nodes_per_hop_dict[node_type][0] = num_unique_nodes if renumber_map is not None: - if len(graph_store.node_types) > 1 or len(graph_store.edge_types) > 1: - raise ValueError( - "Precomputing the renumber map is currently " - "unsupported for heterogeneous graphs." - ) + raise ValueError( + "Precomputing the renumber map is currently " + "unsupported for heterogeneous graphs." 
+ ) - node_type = graph_store.node_types[0] - if not isinstance(node_type, str): - raise ValueError("Node types must be strings") - noi_index = {node_type: torch.as_tensor(renumber_map.values, device="cuda")} - - edge_type = graph_store.edge_types[0] - if ( - not isinstance(edge_type, tuple) - or not isinstance(edge_type[0], str) - or len(edge_type) != 3 - ): - raise ValueError("Edge types must be 3-tuples of strings") - if edge_type[0] != node_type or edge_type[2] != node_type: - raise ValueError("Edge src/dst type must match for homogeneous graphs") - row_dict = { - edge_type: torch.as_tensor(sampling_results.sources.values, device="cuda"), - } - col_dict = { - edge_type: torch.as_tensor( - sampling_results.destinations.values, device="cuda" + # Calculate nodes of interest based on unique nodes in order of appearance + # Use hop 0 sources since those are the only ones not included in destinations + # Use torch.concat based on benchmark performance (vs. cudf.concat) + + if sampling_results_hop_0 is None: + sampling_results_hop_0 = sampling_results.iloc[ + 0 : (hops[1] if len(hops) > 1 else len(sampling_results)) + ] + + nodes_of_interest = ( + cudf.Series( + torch.concat( + [ + torch.as_tensor(sampling_results_hop_0.sources, device="cuda"), + torch.as_tensor(sampling_results.destinations, device="cuda"), + ] ), - } - else: - # Calculate nodes of interest based on unique nodes in order of appearance - # Use hop 0 sources since those are the only ones not included in destinations - # Use torch.concat based on benchmark performance (vs. 
cudf.concat) - nodes_of_interest = ( - cudf.Series( - torch.concat( - [ - torch.as_tensor( - sampling_results_hop_0.sources.values, device="cuda" - ), - torch.as_tensor( - sampling_results.destinations.values, device="cuda" - ), - ] - ), - name="nodes_of_interest", - ) - .drop_duplicates() - .sort_index() + name="nodes_of_interest", ) - del sampling_results_hop_0 + .drop_duplicates() + .sort_index() + ) - # Get the grouped node index (for creating the renumbered grouped edge index) - noi_index = graph_store._get_vertex_groups_from_sample( - torch.as_tensor(nodes_of_interest.values, device="cuda") - ) - del nodes_of_interest + # Get the grouped node index (for creating the renumbered grouped edge index) + noi_index = graph_store._get_vertex_groups_from_sample( + torch.as_tensor(nodes_of_interest, device="cuda") + ) + del nodes_of_interest - # Get the new edge index (by type as expected for HeteroData) - # FIXME handle edge ids/types after the C++ updates - row_dict, col_dict = graph_store._get_renumbered_edge_groups_from_sample( - sampling_results, noi_index - ) + # Get the new edge index (by type as expected for HeteroData) + # FIXME handle edge ids/types after the C++ updates + row_dict, col_dict = graph_store._get_renumbered_edge_groups_from_sample( + sampling_results, noi_index + ) for hop in range(len(hops)): hop_ix_start = hops[hop] hop_ix_end = hops[hop + 1] if hop < len(hops) - 1 else len(sampling_results) - sampling_results_hop = sampling_results.iloc[hop_ix_start:hop_ix_end] + sampling_results_to_hop = sampling_results.iloc[0:hop_ix_end] for node_type in graph_store.node_types: - if len(graph_store.node_types) == 1: - num_unique_nodes = sampling_results_hop.destinations.nunique() - else: - num_unique_nodes = _count_unique_nodes( - sampling_results_hop, graph_store, node_type, "dst" - ) + unique_nodes_hop = _get_unique_nodes( + sampling_results_to_hop, graph_store, node_type, "dst" + ) + + unique_nodes_0 = _get_unique_nodes( + sampling_results_hop_0, 
graph_store, node_type, "src" + ) + + num_unique_nodes = cudf.concat([unique_nodes_0, unique_nodes_hop]).nunique() if num_unique_nodes > 0: if node_type not in num_nodes_per_hop_dict: num_nodes_per_hop_dict[node_type] = torch.zeros( len(hops) + 1, dtype=torch.int64 ) - num_nodes_per_hop_dict[node_type][hop + 1] = num_unique_nodes + num_nodes_per_hop_dict[node_type][hop + 1] = num_unique_nodes - int( + num_nodes_per_hop_dict[node_type][: hop + 1].sum(0) + ) - if len(graph_store.edge_types) == 1: - edge_type = graph_store.edge_types[0] - if edge_type not in num_edges_per_hop_dict: - num_edges_per_hop_dict[edge_type] = torch.zeros( + numeric_etypes, counts = torch.unique( + torch.as_tensor( + sampling_results.iloc[hop_ix_start:hop_ix_end].edge_type, + device="cuda", + ), + return_counts=True, + ) + numeric_etypes = list(numeric_etypes) + counts = list(counts) + for num_etype, count in zip(numeric_etypes, counts): + can_etype = graph_store.numeric_edge_type_to_canonical(num_etype) + if can_etype not in num_edges_per_hop_dict: + num_edges_per_hop_dict[can_etype] = torch.zeros( len(hops), dtype=torch.int64 ) - num_edges_per_hop_dict[graph_store.edge_types[0]][hop] = len( - sampling_results_hop - ) - else: - numeric_etypes, counts = torch.unique( - torch.as_tensor(sampling_results_hop.edge_type.values, device="cuda"), - return_counts=True, - ) - numeric_etypes = list(numeric_etypes) - counts = list(counts) - for num_etype, count in zip(numeric_etypes, counts): - can_etype = graph_store.numeric_edge_type_to_canonical(num_etype) - if can_etype not in num_edges_per_hop_dict: - num_edges_per_hop_dict[can_etype] = torch.zeros( - len(hops), dtype=torch.int64 - ) - num_edges_per_hop_dict[can_etype][hop] = count + num_edges_per_hop_dict[can_etype][hop] = count if HeteroSamplerOutput is None: raise ImportError("Error importing from pyg") diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py 
b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py index e29f3aea512..55aebf305da 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py @@ -24,7 +24,7 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_basic(dask_client, karate_gnn): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") loader = CuGraphNeighborLoader( (cugraph_store, cugraph_store), torch.arange(N["type0"] + N["type1"], dtype=torch.int64), @@ -52,7 +52,7 @@ def test_cugraph_loader_basic(dask_client, karate_gnn): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_hetero(dask_client, karate_gnn): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") loader = CuGraphNeighborLoader( (cugraph_store, cugraph_store), input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")), diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py index 550852a3303..a1a72a44d0c 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py @@ -17,7 +17,9 @@ import pytest from cugraph_pyg.data import CuGraphStore -from cugraph_pyg.sampler.cugraph_sampler import _sampler_output_from_sampling_results +from cugraph_pyg.sampler.cugraph_sampler import ( + _sampler_output_from_sampling_results_heterogeneous, +) from cugraph.gnn import FeatureStore @@ -31,7 +33,7 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_neighbor_sample(dask_client, basic_graph_1): F, G, N = basic_graph_1 - 
cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") batches = cudf.DataFrame( { @@ -56,7 +58,7 @@ def test_neighbor_sample(dask_client, basic_graph_1): .sort_values(by=["sources", "destinations"]) ) - out = _sampler_output_from_sampling_results( + out = _sampler_output_from_sampling_results_heterogeneous( sampling_results=sampling_results, renumber_map=None, graph_store=cugraph_store, @@ -84,7 +86,7 @@ def test_neighbor_sample(dask_client, basic_graph_1): # check the hop dictionaries assert len(out.num_sampled_nodes) == 1 - assert out.num_sampled_nodes["vt1"].tolist() == [4, 4] + assert out.num_sampled_nodes["vt1"].tolist() == [4, 1] assert len(out.num_sampled_edges) == 1 assert out.num_sampled_edges[("vt1", "pig", "vt1")].tolist() == [6] @@ -95,7 +97,7 @@ def test_neighbor_sample(dask_client, basic_graph_1): @pytest.mark.skip(reason="broken") def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph_1): F, G, N = multi_edge_multi_vertex_graph_1 - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") batches = cudf.DataFrame( { @@ -119,7 +121,7 @@ def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph .compute() ) - out = _sampler_output_from_sampling_results( + out = _sampler_output_from_sampling_results_heterogeneous( sampling_results=sampling_results, renumber_map=None, graph_store=cugraph_store, @@ -144,8 +146,8 @@ def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph # check the hop dictionaries assert len(out.num_sampled_nodes) == 2 - assert out.num_sampled_nodes["black"].tolist() == [2, 2] - assert out.num_sampled_nodes["brown"].tolist() == [3, 2] + assert out.num_sampled_nodes["black"].tolist() == [2, 0] + assert out.num_sampled_nodes["brown"].tolist() == [3, 0] assert len(out.num_sampled_edges) == 5 assert out.num_sampled_edges[("brown", 
"horse", "brown")].tolist() == [2] @@ -186,7 +188,7 @@ def test_neighbor_sample_mock_sampling_results(dask_client): torch.tensor([3.2, 2.1], dtype=torch.float32), type_name="A", feat_name="prop1" ) - graph_store = CuGraphStore(F, G, N, multi_gpu=True) + graph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( @@ -198,7 +200,7 @@ def test_neighbor_sample_mock_sampling_results(dask_client): } ) - out = _sampler_output_from_sampling_results( + out = _sampler_output_from_sampling_results_heterogeneous( mock_sampling_results, None, graph_store, None ) @@ -218,9 +220,9 @@ def test_neighbor_sample_mock_sampling_results(dask_client): assert out.col[("B", "ba", "A")].tolist() == [1, 1] assert len(out.num_sampled_nodes) == 3 - assert out.num_sampled_nodes["A"].tolist() == [2, 0, 1, 0, 1] - assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 1, 0] - assert out.num_sampled_nodes["C"].tolist() == [0, 0, 2, 0, 2] + assert out.num_sampled_nodes["A"].tolist() == [2, 0, 0, 0, 0] + assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 0, 0] + assert out.num_sampled_nodes["C"].tolist() == [0, 0, 2, 0, 1] assert len(out.num_sampled_edges) == 3 assert out.num_sampled_edges[("A", "ab", "B")].tolist() == [3, 0, 1, 0] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py index a5a59623710..43b1e5da5a0 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py @@ -117,8 +117,8 @@ def test_get_edge_index(graph, edge_index_type, dask_client): G[et][1] = cudf.Series(G[et][1]) elif edge_index_type == "dask-cudf": for et in list(G.keys()): - G[et][0] = dask_cudf.from_cudf(cudf.Series(G[et][0]), npartitions=2) - G[et][1] = dask_cudf.from_cudf(cudf.Series(G[et][1]), npartitions=2) + G[et][0] = 
dask_cudf.from_cudf(cudf.Series(G[et][0]), npartitions=1) + G[et][1] = dask_cudf.from_cudf(cudf.Series(G[et][1]), npartitions=1) cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) @@ -215,7 +215,7 @@ def test_renumber_vertices_multi_edge_multi_vertex( def test_renumber_edges(abc_graph, dask_client): F, G, N = abc_graph - graph_store = CuGraphStore(F, G, N, multi_gpu=True) + graph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py index 620f1a5eb85..48a21cb7fd6 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py @@ -26,12 +26,14 @@ from cugraph.utilities.utils import import_optional, MissingModule torch = import_optional("torch") +torch_geometric = import_optional("torch_geometric") +trim_to_layer = import_optional("torch_geometric.utils.trim_to_layer") @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_basic(karate_gnn): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") loader = CuGraphNeighborLoader( (cugraph_store, cugraph_store), torch.arange(N["type0"] + N["type1"], dtype=torch.int64), @@ -57,7 +59,7 @@ def test_cugraph_loader_basic(karate_gnn): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_hetero(karate_gnn): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") loader = CuGraphNeighborLoader( (cugraph_store, cugraph_store), input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")), @@ -82,23 +84,29 @@ def test_cugraph_loader_hetero(karate_gnn): @pytest.mark.skipif(isinstance(torch, 
MissingModule), reason="torch not available") def test_cugraph_loader_from_disk(): + m = [2, 9, 99, 82, 9, 3, 18, 1, 12] + n = torch.arange(1, 1 + len(m), dtype=torch.int32) + x = torch.zeros(256, dtype=torch.int32) + x[torch.tensor(m, dtype=torch.int32)] = n F = FeatureStore() - F.add_data(torch.tensor([1, 2, 3, 4, 5, 6, 7]), "t0", "x") + F.add_data(x, "t0", "x") - G = {("t0", "knows", "t0"): 7} - N = {"t0": 7} + G = {("t0", "knows", "t0"): 9080} + N = {"t0": 256} - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") bogus_samples = cudf.DataFrame( { - "sources": [0, 1, 2, 3, 4, 5, 6], - "destinations": [6, 4, 3, 2, 2, 1, 5], - "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0], dtype="int32"), - "edge_id": [5, 10, 15, 20, 25, 30, 35], - "hop_id": cudf.Series([0, 0, 0, 1, 1, 2, 2], dtype="int32"), + "sources": [0, 1, 2, 3, 4, 5, 6, 6], + "destinations": [5, 4, 3, 2, 2, 6, 5, 2], + "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), + "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], + "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"), } ) + map = cudf.Series(m, name="map") + bogus_samples = bogus_samples.join(map, how="outer").sort_index() tempdir = tempfile.TemporaryDirectory() for s in range(256): @@ -115,32 +123,49 @@ def test_cugraph_loader_from_disk(): for sample in loader: num_samples += 1 assert sample["t0"]["num_nodes"] == 7 - # correct vertex order is [0, 1, 2, 6, 4, 3, 5]; x = [1, 2, 3, 7, 5, 4, 6] - assert sample["t0"]["x"].tolist() == [1, 2, 3, 7, 5, 4, 6] - assert list(sample[("t0", "knows", "t0")]["edge_index"].shape) == [2, 7] + # correct vertex order is [0, 1, 2, 5, 4, 3, 6]; x = [1, 2, 3, 6, 5, 4, 7] + assert sample["t0"]["x"].tolist() == [3, 4, 5, 6, 7, 8, 9] + + edge_index = sample[("t0", "knows", "t0")]["edge_index"] + assert list(edge_index.shape) == [2, 8] + + assert ( + edge_index[0].tolist() + == bogus_samples.sources.dropna().values_host.tolist() + ) + assert ( + 
edge_index[1].tolist() + == bogus_samples.destinations.dropna().values_host.tolist() + ) assert num_samples == 256 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_from_disk_subset(): + m = [2, 9, 99, 82, 9, 3, 18, 1, 12] + n = torch.arange(1, 1 + len(m), dtype=torch.int32) + x = torch.zeros(256, dtype=torch.int32) + x[torch.tensor(m, dtype=torch.int32)] = n F = FeatureStore() - F.add_data(torch.tensor([1, 2, 3, 4, 5, 6, 7]), "t0", "x") + F.add_data(x, "t0", "x") - G = {("t0", "knows", "t0"): 7} - N = {"t0": 7} + G = {("t0", "knows", "t0"): 9080} + N = {"t0": 256} - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") bogus_samples = cudf.DataFrame( { - "sources": [0, 1, 2, 3, 4, 5, 6], - "destinations": [6, 4, 3, 2, 2, 1, 5], - "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0], dtype="int32"), - "edge_id": [5, 10, 15, 20, 25, 30, 35], - "hop_id": cudf.Series([0, 0, 0, 1, 1, 2, 2], dtype="int32"), + "sources": [0, 1, 2, 3, 4, 5, 6, 6], + "destinations": [5, 4, 3, 2, 2, 6, 5, 2], + "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), + "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], + "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"), } ) + map = cudf.Series(m, name="map") + bogus_samples = bogus_samples.join(map, how="outer").sort_index() tempdir = tempfile.TemporaryDirectory() for s in range(256): @@ -159,33 +184,45 @@ def test_cugraph_loader_from_disk_subset(): num_samples += 1 assert sample["t0"]["num_nodes"] == 7 # correct vertex order is [0, 1, 2, 6, 4, 3, 5]; x = [1, 2, 3, 7, 5, 4, 6] - assert sample["t0"]["x"].tolist() == [1, 2, 3, 7, 5, 4, 6] - assert list(sample[("t0", "knows", "t0")]["edge_index"].shape) == [2, 7] + assert sample["t0"]["x"].tolist() == [3, 4, 5, 6, 7, 8, 9] + + edge_index = sample[("t0", "knows", "t0")]["edge_index"] + assert list(edge_index.shape) == [2, 8] + + assert ( + edge_index[0].tolist() + == 
bogus_samples.sources.dropna().values_host.tolist() + ) + assert ( + edge_index[1].tolist() + == bogus_samples.destinations.dropna().values_host.tolist() + ) assert num_samples == 100 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -def test_cugraph_loader_from_disk_subset_renumbered(): +def test_cugraph_loader_e2e_coo(): + m = [2, 9, 99, 82, 9, 3, 18, 1, 12] + x = torch.randint(3000, (256, 256)).to(torch.float32) F = FeatureStore() - F.add_data(torch.tensor([1, 2, 3, 4, 5, 6, 7]), "t0", "x") + F.add_data(x, "t0", "x") - G = {("t0", "knows", "t0"): 7} - N = {"t0": 7} + G = {("t0", "knows", "t0"): 9999} + N = {"t0": 256} - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") bogus_samples = cudf.DataFrame( { - "sources": [0, 1, 2, 3, 4, 5, 6], - "destinations": [6, 4, 3, 2, 2, 1, 5], - "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0], dtype="int32"), - "edge_id": [5, 10, 15, 20, 25, 30, 35], - "hop_id": cudf.Series([0, 0, 0, 1, 1, 2, 2], dtype="int32"), + "sources": [0, 1, 2, 3, 4, 5, 6, 6], + "destinations": [5, 4, 3, 2, 2, 6, 5, 2], + "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), + "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], + "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"), } ) - - map = cudf.Series([2, 9, 0, 2, 1, 3, 4, 6, 5], name="map") + map = cudf.Series(m, name="map") bogus_samples = bogus_samples.join(map, how="outer").sort_index() tempdir = tempfile.TemporaryDirectory() @@ -200,22 +237,35 @@ def test_cugraph_loader_from_disk_subset_renumbered(): input_files=list(os.listdir(tempdir.name))[100:200], ) - num_samples = 0 - for sample in loader: - num_samples += 1 - assert sample["t0"]["num_nodes"] == 7 - # correct vertex order is [0, 2, 1, 3, 4, 6, 5]; x = [1, 3, 2, 4, 5, 7, 6] - assert sample["t0"]["x"].tolist() == [1, 3, 2, 4, 5, 7, 6] + convs = [ + torch_geometric.nn.SAGEConv(256, 64, aggr="mean").cuda(), + torch_geometric.nn.SAGEConv(64, 8, 
aggr="mean").cuda(), + torch_geometric.nn.SAGEConv(8, 1, aggr="mean").cuda(), + ] - edge_index = sample[("t0", "knows", "t0")]["edge_index"] - assert list(edge_index.shape) == [2, 7] - assert ( - edge_index[0].tolist() - == bogus_samples.sources.dropna().values_host.tolist() - ) - assert ( - edge_index[1].tolist() - == bogus_samples.destinations.dropna().values_host.tolist() - ) + trim = trim_to_layer.TrimToLayer() + relu = torch.nn.functional.relu + dropout = torch.nn.functional.dropout - assert num_samples == 100 + for hetero_data in loader: + ei = hetero_data["t0", "knows", "t0"]["edge_index"] + x = hetero_data["t0"]["x"].cuda() + num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"] + num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"] + + print(num_sampled_nodes, num_sampled_edges) + + for i in range(len(convs)): + x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None) + + s = x.shape[0] + + x = convs[i](x, ei, size=(s, s)) + x = relu(x) + x = dropout(x, p=0.5) + print(x.shape) + + print(x.shape) + x = x.narrow(dim=0, start=0, length=x.shape[0] - num_sampled_nodes[1]) + + assert list(x.shape) == [3, 1] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py index 08a8625b33b..84f62e80c9d 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py @@ -17,7 +17,9 @@ import pytest from cugraph_pyg.data import CuGraphStore -from cugraph_pyg.sampler.cugraph_sampler import _sampler_output_from_sampling_results +from cugraph_pyg.sampler.cugraph_sampler import ( + _sampler_output_from_sampling_results_heterogeneous, +) from cugraph.utilities.utils import import_optional, MissingModule from cugraph import uniform_neighbor_sample @@ -29,7 +31,7 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_neighbor_sample(basic_graph_1): 
F, G, N = basic_graph_1 - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") batches = cudf.DataFrame( { @@ -49,7 +51,7 @@ def test_neighbor_sample(basic_graph_1): return_offsets=False, ).sort_values(by=["sources", "destinations"]) - out = _sampler_output_from_sampling_results( + out = _sampler_output_from_sampling_results_heterogeneous( sampling_results=sampling_results, renumber_map=None, graph_store=cugraph_store, @@ -77,7 +79,7 @@ def test_neighbor_sample(basic_graph_1): # check the hop dictionaries assert len(out.num_sampled_nodes) == 1 - assert out.num_sampled_nodes["vt1"].tolist() == [4, 4] + assert out.num_sampled_nodes["vt1"].tolist() == [4, 1] assert len(out.num_sampled_edges) == 1 assert out.num_sampled_edges[("vt1", "pig", "vt1")].tolist() == [6] @@ -87,7 +89,7 @@ def test_neighbor_sample(basic_graph_1): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): F, G, N = multi_edge_multi_vertex_graph_1 - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") batches = cudf.DataFrame( { @@ -107,7 +109,7 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): with_batch_ids=True, ).sort_values(by=["sources", "destinations"]) - out = _sampler_output_from_sampling_results( + out = _sampler_output_from_sampling_results_heterogeneous( sampling_results=sampling_results, renumber_map=None, graph_store=cugraph_store, @@ -132,8 +134,8 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): # check the hop dictionaries assert len(out.num_sampled_nodes) == 2 - assert out.num_sampled_nodes["black"].tolist() == [2, 2] - assert out.num_sampled_nodes["brown"].tolist() == [3, 2] + assert out.num_sampled_nodes["black"].tolist() == [2, 0] + assert out.num_sampled_nodes["brown"].tolist() == [3, 0] assert len(out.num_sampled_edges) == 5 assert 
out.num_sampled_edges[("brown", "horse", "brown")].tolist() == [2] @@ -147,7 +149,7 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): def test_neighbor_sample_mock_sampling_results(abc_graph): F, G, N = abc_graph - graph_store = CuGraphStore(F, G, N) + graph_store = CuGraphStore(F, G, N, order="CSR") # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( @@ -159,7 +161,7 @@ def test_neighbor_sample_mock_sampling_results(abc_graph): } ) - out = _sampler_output_from_sampling_results( + out = _sampler_output_from_sampling_results_heterogeneous( mock_sampling_results, None, graph_store, None ) @@ -179,9 +181,9 @@ def test_neighbor_sample_mock_sampling_results(abc_graph): assert out.col[("B", "ba", "A")].tolist() == [1, 1] assert len(out.num_sampled_nodes) == 3 - assert out.num_sampled_nodes["A"].tolist() == [2, 0, 1, 0, 1] - assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 1, 0] - assert out.num_sampled_nodes["C"].tolist() == [0, 0, 2, 0, 2] + assert out.num_sampled_nodes["A"].tolist() == [2, 0, 0, 0, 0] + assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 0, 0] + assert out.num_sampled_nodes["C"].tolist() == [0, 0, 2, 0, 1] assert len(out.num_sampled_edges) == 3 assert out.num_sampled_edges[("A", "ab", "B")].tolist() == [3, 0, 1, 0] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py index 289dd69a829..e815b813050 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py @@ -199,7 +199,7 @@ def test_renumber_vertices_multi_edge_multi_vertex(multi_edge_multi_vertex_graph def test_renumber_edges(abc_graph): F, G, N = abc_graph - graph_store = CuGraphStore(F, G, N) + graph_store = CuGraphStore(F, G, N, order="CSR") # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( From 
9c96b2613f029eb9616c19aefb38a689c1267bae Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Mon, 25 Sep 2023 07:49:19 -0700 Subject: [PATCH 16/22] Update dgl benchmarks (#3775) This PR adds upstream DGL benchmarks , I will expand this to add `cugraph-dgl` soon. image CC: @tingyu66 , @BradReesWork Authors: - Vibhu Jawa (https://github.com/VibhuJawa) Approvers: - Brad Rees (https://github.com/BradReesWork) - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3775 --- .../cugraph-dgl/scale-benchmarks/__init__.py | 0 .../scale-benchmarks/dgl_benchmark.py | 152 ++++++++++++++++++ .../scale-benchmarks/load_graph_feats.py | 123 ++++++++++++++ .../cugraph-dgl/scale-benchmarks/model.py | 110 +++++++++++++ 4 files changed, 385 insertions(+) create mode 100644 benchmarks/cugraph-dgl/scale-benchmarks/__init__.py create mode 100644 benchmarks/cugraph-dgl/scale-benchmarks/dgl_benchmark.py create mode 100644 benchmarks/cugraph-dgl/scale-benchmarks/load_graph_feats.py create mode 100644 benchmarks/cugraph-dgl/scale-benchmarks/model.py diff --git a/benchmarks/cugraph-dgl/scale-benchmarks/__init__.py b/benchmarks/cugraph-dgl/scale-benchmarks/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/benchmarks/cugraph-dgl/scale-benchmarks/dgl_benchmark.py b/benchmarks/cugraph-dgl/scale-benchmarks/dgl_benchmark.py new file mode 100644 index 00000000000..3762226d570 --- /dev/null +++ b/benchmarks/cugraph-dgl/scale-benchmarks/dgl_benchmark.py @@ -0,0 +1,152 @@ +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import time +import dgl +from dgl.dataloading import MultiLayerNeighborSampler, DataLoader +import pandas as pd +import torch +from model import run_1_epoch +from argparse import ArgumentParser +from load_graph_feats import load_edges_from_disk, load_node_labels, load_node_features + +class DataLoaderArgs: + def __init__(self, args): + self.dataset_path = args.dataset_path + self.replication_factors = [int(x) for x in args.replication_factors.split(",")] + self.fanouts = [[int(y) for y in x.split("_")] for x in args.fanouts.split(",")] + self.batch_sizes = [int(x) for x in args.batch_sizes.split(",")] + self.use_uva = not args.do_not_use_uva + + + +def create_dataloader(g, train_idx, batch_size, fanouts, use_uva): + print("Creating dataloader", flush=True) + st = time.time() + if use_uva: + train_idx = {k: v.to("cuda") for k, v in train_idx.items()} + sampler = MultiLayerNeighborSampler(fanouts=fanouts) + dataloader = DataLoader( + g, + train_idx, + sampler, + num_workers=0, + batch_size=batch_size, + use_uva=use_uva, + shuffle=False, + drop_last=False, + ) + et = time.time() + print(f"Time to create dataloader = {et - st:.2f} seconds", flush=True) + return dataloader + + + +def create_dgl_graph_from_disk(dataset_path, replication_factor=1): + """ + Create a DGL graph from a dataset on disk. + Args: + dataset_path: Path to the dataset on disk. + replication_factor: Number of times to replicate the edges. + Returns: + DGLGraph: DGLGraph with the loaded dataset. 
+ """ + with open(os.path.join(dataset_path, "meta.json"), "r") as f: + input_meta = json.load(f) + + parquet_path = os.path.join(dataset_path, "parquet") + graph_data = load_edges_from_disk( + parquet_path, replication_factor, input_meta + ) + label_data = load_node_labels(dataset_path, replication_factor, input_meta) + if replication_factor <8 : + feat_data = load_node_features(dataset_path, replication_factor, node_type='paper') + else: + feat_data = None + print("labels and features loaded ", flush=True) + + g = dgl.heterograph(graph_data) + + return g, label_data, feat_data + + +def main(args): + print(f"Running dgl dataloading benchmark with the following parameters:\n" + f"Dataset path = {args.dataset_path}\n" + f"Replication factors = {args.replication_factors}\n" + f"Fanouts = {args.fanouts}\n" + f"Batch sizes = {args.batch_sizes}\n" + f"Use UVA = {args.use_uva}\n" + f"{'=' * 30}") + + time_ls = [] + for replication_factor in args.replication_factors: + start_time = time.time() + g, label_data, feat_data = create_dgl_graph_from_disk(args.dataset_path, replication_factor) + elapsed_time = time.time() - start_time + + print(f"Replication factor = {replication_factor}\n" + f"G has {g.num_edges():,} edges and took {elapsed_time:.2f} seconds to load", flush=True) + + train_idx = {"paper": label_data["paper"]["train_idx"]} + y = label_data["paper"]["y"] + r_time_ls = e2e_benchmark(g, feat_data, y, train_idx, args.fanouts, args.batch_sizes, use_uva=args.use_uva) + [x.update({"replication_factor": replication_factor}) for x in r_time_ls] + [x.update({"num_edges": g.num_edges()}) for x in r_time_ls] + time_ls.extend(r_time_ls) + + print(f"Benchmark completed for replication factor = {replication_factor}\n{'=' * 30}", flush=True) + + df = pd.DataFrame(time_ls) + df.to_csv("dgl_e2e_benchmark.csv", index=False) + print(f"Benchmark completed for all replication factors\n{'=' * 30}", flush=True) + + +def e2e_benchmark(g, feat, y, train_idx, fanouts, batch_sizes, 
use_uva): + """ + Run the e2e_benchmark + Args: + g: DGLGraph + feat: Tensor containing the features. + y: Tensor containing the labels. + train_idx: Tensor containing the training indices. + fanouts: List of fanouts to use for the dataloader. + batch_sizes: List of batch sizes to use for the dataloader. + use_uva: Whether to use unified virtual address space. + model_backend: Backend of model to use. + """ + time_ls = [] + for fanout in fanouts: + for batch_size in batch_sizes: + dataloader = create_dataloader(g, train_idx, batch_size, fanout, use_uva) + time_d = run_1_epoch(dataloader, feat, y, fanout, batch_size, model_backend='dgl') + time_ls.append(time_d) + print("="*30) + return time_ls + + + +def parse_arguments(): + parser = ArgumentParser() + parser.add_argument("--dataset_path", type=str, default="/raid/vjawa/ogbn_papers100M") + parser.add_argument("--replication_factors", type=str, default="2") + parser.add_argument("--fanouts", type=str, default="10_10_10") + parser.add_argument("--batch_sizes", type=str, default="512,1024,8192,16384") + parser.add_argument("--do_not_use_uva", action="store_true") + return parser.parse_args() + +if __name__ == "__main__": + arguments = parse_arguments() + main(DataLoaderArgs(arguments)) diff --git a/benchmarks/cugraph-dgl/scale-benchmarks/load_graph_feats.py b/benchmarks/cugraph-dgl/scale-benchmarks/load_graph_feats.py new file mode 100644 index 00000000000..4f0f81c70e1 --- /dev/null +++ b/benchmarks/cugraph-dgl/scale-benchmarks/load_graph_feats.py @@ -0,0 +1,123 @@ +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import torch +import os + + +def load_edges_from_disk(parquet_path, replication_factor, input_meta): + """ + Load the edges from disk into a graph data dictionary. + Args: + parquet_path: Path to the parquet directory. + replication_factor: Number of times to replicate the edges. + input_meta: Input meta data. + Returns: + dict: Dictionary of edge types to a tuple of (src, dst) + """ + graph_data = {} + + for edge_type in input_meta["num_edges"].keys(): + print(f"Loading edge index for edge type {edge_type} for replication factor = {replication_factor}") + + canonical_edge_type = tuple(edge_type.split("__")) + edge_index = pd.read_parquet(os.path.join(parquet_path, edge_type, "edge_index.parquet")) + edge_index = { + "src": torch.from_numpy(edge_index.src.values), + "dst": torch.from_numpy(edge_index.dst.values), + } + + if replication_factor > 1: + src_list, dst_list = replicate_edges(edge_index, canonical_edge_type, replication_factor, input_meta) + edge_index["src"] = torch.cat(src_list).contiguous() + edge_index["dst"] = torch.cat(dst_list).contiguous() + + graph_data[canonical_edge_type] = edge_index["src"], edge_index["dst"] + + print("Read Edge Data") + return graph_data + + +def replicate_edges(edge_index, canonical_edge_type, replication_factor, input_meta): + src_list = [edge_index["src"]] + dst_list = [edge_index["dst"]] + + for r in range(1, replication_factor): + new_src = edge_index["src"] + (r * input_meta["num_nodes"][canonical_edge_type[0]]) + new_dst = edge_index["dst"] + (r * 
input_meta["num_nodes"][canonical_edge_type[2]]) + src_list.append(new_src) + dst_list.append(new_dst) + + return src_list, dst_list + + + + +def load_node_labels(dataset_path, replication_factor, input_meta): + num_nodes_dict = {node_type: t * replication_factor for node_type, t in input_meta["num_nodes"].items()} + node_data = {} + + for node_type in input_meta["num_nodes"].keys(): + node_data[node_type] = {} + label_path = os.path.join(dataset_path, "parquet", node_type, "node_label.parquet") + + if os.path.exists(label_path): + node_data[node_type] = process_node_label(label_path, node_type, replication_factor, num_nodes_dict, input_meta) + + else: + node_data[node_type]["num_nodes"] = num_nodes_dict[node_type] + + print("Loaded node labels", flush=True) + return node_data + +def process_node_label(label_path, node_type, replication_factor, num_nodes_dict, input_meta): + node_label = pd.read_parquet(label_path) + + if replication_factor > 1: + node_label = replicate_node_label(node_label, node_type, replication_factor, input_meta) + + node_label_tensor = torch.full((num_nodes_dict[node_type],), -1, dtype=torch.float32) + node_label_tensor[torch.as_tensor(node_label.node.values)] = torch.as_tensor(node_label.label.values) + + del node_label + + return { + "train_idx": (node_label_tensor > -1).contiguous().nonzero().view(-1), + "y": node_label_tensor.contiguous().long() + } + + +def replicate_node_label(node_label, node_type, replication_factor, input_meta): + base_num_nodes = input_meta["num_nodes"][node_type] + + replicated_df = pd.DataFrame({ + "node": pd.concat([node_label.node + (r * base_num_nodes) for r in range(1, replication_factor)]), + "label": pd.concat([node_label.label for _ in range(1, replication_factor)]) + }) + + return pd.concat([node_label, replicated_df]).reset_index(drop=True) + + +def load_node_features(dataset_path, replication_factor, node_type): + print("Loading node features", flush=True) + node_type_path = os.path.join(dataset_path, 
"npy", node_type) + if replication_factor == 1: + fname = os.path.join(node_type_path, "node_feat.npy") + else: + fname = os.path.join(node_type_path, f"node_feat_{replication_factor}x.npy") + + feat = torch.from_numpy(np.load(fname)) + print("Loaded node features", flush=True) + return feat diff --git a/benchmarks/cugraph-dgl/scale-benchmarks/model.py b/benchmarks/cugraph-dgl/scale-benchmarks/model.py new file mode 100644 index 00000000000..506e3bd5227 --- /dev/null +++ b/benchmarks/cugraph-dgl/scale-benchmarks/model.py @@ -0,0 +1,110 @@ +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +import torch.nn.functional as F +import time + + +class GNN(torch.nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, num_layers, model_backend='dgl'): + if model_backend == 'dgl': + from dgl.nn import SAGEConv + else: + from cugraph_dgl.nn import SAGEConv + + super(GNN, self).__init__() + self.convs = torch.nn.ModuleList() + for _ in range(num_layers - 1): + self.convs.append(SAGEConv(in_channels, hidden_channels, aggregator_type='mean')) + in_channels = hidden_channels + self.convs.append(SAGEConv(hidden_channels, out_channels, aggregator_type='mean')) + + def forward(self, blocks, x): + for i, conv in enumerate(self.convs): + x = conv(blocks[i], x) + if i != len(self.convs) - 1: + x = F.relu(x) + return x + + +def create_model(feat_size, num_classes, num_layers, model_backend='dgl'): + model = GNN(feat_size, 64, num_classes, num_layers, model_backend=model_backend) + model = model.to('cuda') + model.train() + return model + +def train_model(model, dataloader, opt, feat, y): + times = {key: 0 for key in ['mfg_creation', 'feature', 'm_fwd', 'm_bkwd']} + epoch_st = time.time() + mfg_st = time.time() + for input_nodes, output_nodes, blocks in dataloader: + times['mfg_creation'] += time.time() - mfg_st + if feat is not None: + fst = time.time() + input_nodes = input_nodes.to('cpu') + input_feat = feat[input_nodes] + input_feat = input_feat.to('cuda') + if isinstance(output_nodes, dict): + output_nodes = output_nodes['paper'] + output_nodes = output_nodes.to(y.device) + y_batch = y[output_nodes].to('cuda') + times['feature'] += time.time() - fst + + m_fwd_st = time.time() + y_hat = model(blocks, input_feat) + times['m_fwd'] += time.time() - m_fwd_st + + m_bkwd_st = time.time() + loss = F.cross_entropy(y_hat, y_batch) + opt.zero_grad() + loss.backward() + opt.step() + times['m_bkwd'] += time.time() - m_bkwd_st + mfg_st = time.time() + + print(f"Epoch time = {time.time() - epoch_st:.2f} seconds") + + return times + +def 
analyze_time(dataloader, times, epoch_time, fanout, batch_size): + num_batches = len(dataloader) + time_d = { + "fanout": fanout, + "batch_size": batch_size, + "epoch_time": epoch_time, + "epoch_time_per_batch": epoch_time / num_batches, + "num_batches": num_batches, + } + for key, value in times.items(): + time_d[f"{key}_time_per_epoch"] = value + time_d[f"{key}_time_per_batch"] = value / num_batches + + print(f"Time analysis for fanout = {fanout}, batch_size = {batch_size}") + for k in time_d.keys(): + if 'time_per_epoch' in str(k): + print(f"{k} = {time_d[k]:.2f} seconds") + return time_d + +def run_1_epoch(dataloader, feat, y, fanout, batch_size, model_backend): + if feat is not None: + model = create_model(feat.shape[1], 172, len(fanout), model_backend=model_backend) + opt = torch.optim.Adam(model.parameters(), lr=0.01) + else: + model = None + opt = None + epoch_st = time.time() + times = train_model(model, dataloader, opt, feat, y) + epoch_time = time.time() - epoch_st + time_d = analyze_time(dataloader, times, epoch_time, fanout, batch_size) + return time_d From c11eff23926dd483d23444a4757629e8ed069683 Mon Sep 17 00:00:00 2001 From: Don Acosta <97529984+acostadon@users.noreply.github.com> Date: Mon, 25 Sep 2023 21:59:29 -0400 Subject: [PATCH 17/22] similarity notebook to compare link prediction algos (#3868) New notebook to compare link prediction Dependent on dining_prefs being added to datasets api in PR #3866 Authors: - Don Acosta (https://github.com/acostadon) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3868 --- .../link_prediction/similarity_combined.ipynb | 217 ++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 notebooks/algorithms/link_prediction/similarity_combined.ipynb diff --git a/notebooks/algorithms/link_prediction/similarity_combined.ipynb b/notebooks/algorithms/link_prediction/similarity_combined.ipynb new file mode 100644 index 00000000000..cd80ee34002 --- 
/dev/null +++ b/notebooks/algorithms/link_prediction/similarity_combined.ipynb @@ -0,0 +1,217 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Similarity Compared\n", + "----\n", + "\n", + "In this notebook, we will compute vertex similarity scores using the various cuGraph algorithms. We will then compare the similarities scores in tables.\n", + "\n", + "| Author Credit | Date | Update | cuGraph Version | Test Hardware |\n", + "| --------------|------------|------------------|-----------------|-----------------------|\n", + "| Don Acosta | 09/25/2023 | created | 23.10 nightly | AMPERE A6000 CUDA 11.7|\n", + "\n", + "\n", + "**Note: On large graphs these algorithms can take prohibitive time or memory. The notebook will show how to run on defined pairs instead.**\n", + "\n", + "The Similarity algorithms in cuGraph use different methods to compare pairs of vertices. All of them use the intersection of the set of adjacent nodes for the set overlap. However each of the three algorithms differ on the denominator to determine the similarity coefficients. All three are normalized between zero and one. where zero is no overlap at all and one means identical adjacencies.\n", + "\n", + "__Jaccard Similarity__
\n", + "The [Jaccard similarity](https://en.wikipedia.org/wiki/Jaccard_index) measure was developed by botanist Paul Jaccard, who used the measure to compare plant species. His work popularized the measure's use in other fields as well.\n", + "\n", + "It can be expressed as:
\n", + "$\\text{Jaccard similarity} = \\frac{|A \\cap B|}{|A \\cup B|}$\n", + "\n", + "__Overlap Similarity__
\n", + "The [Overlap Similarity](https://en.wikipedia.org/wiki/Overlap_coefficient) is also known as the Szymkiewicz–Simpson coefficient. It is often used to compare binary and categorical data in the fields of Genome analysis, recommender systems and anomaly detection. It differs from the Jaccard measure above in that it uses the size of the smaller of the two set sizes as the denominator.\n", + "\n", + "It can be expressed as\n", + "\n", + "$oc(A,B)=\\frac{|A|\\cap|B|}{min(|A|,|B|)}$\n", + "\n", + "__Sørensen-Dice Coefficient__
\n", + "The [Sørensen coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient#) is known as the Sørensen-Dice coefficient. It was independently developed for use by botanists Lee Raymond Dice and Thorvald Sørensen. Although originating in the field of Botany, the coefficient is now used in computer vision, Natural Language Processing (NLP) and Data Mining among other fields.\n", + "It differs from Jaccard and Overlap in that the calculation doubles the intersection size and divides it by the sum of the two set sizes.\n", + "\n", + "It can be expressed as\n", + "\n", + "Sørensen coefficient = $\left(2 * |A \cap B| \right) \over \left(|A| + |B| \right)$\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Now for the code!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the required dependencies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import cugraph\n", + "from cugraph.datasets import dining_prefs\n", + "# only needed to display results in a table \n", + "from IPython.display import display_html " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Function that calls all the cuGraph similarity/link prediction algorithms " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_similarity(G,pairs=None):\n", + " _jdf = cugraph.jaccard(G,pairs)\n", + " _jdf2 = _jdf[ (_jdf['first'] != _jdf['second'] ) ]\n", + " _odf = cugraph.overlap(G,pairs)\n", + " _odf2 = _odf[ (_odf['first'] != _odf['second'] ) ]\n", + " _sdf = cugraph.sorensen_coefficient(G,pairs)\n", + " _sdf2 = _sdf[ (_sdf['first'] != _sdf['second'] ) ]\n", + " return _jdf2, _odf2, _sdf2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Function to put all the results in a convenient table" + ] + }, + {
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print function\n", + "def print_similarity(jdf,odf,sdf,num_records=5):\n", + "\n", + " js_top = jdf.sort_values(by='jaccard_coeff', ascending=False).head(num_records).to_pandas()\n", + " os_top = odf.sort_values(by='overlap_coeff', ascending=False).head(num_records).to_pandas()\n", + " ss_top = sdf.sort_values(by='sorensen_coeff', ascending=False).head(num_records).to_pandas()\n", + " \n", + " df1_styler = js_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Jaccard').hide(axis='index')\n", + " df2_styler = os_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Overlap').hide(axis='index')\n", + " df3_styler = ss_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Sørensen').hide(axis='index')\n", + "\n", + " display_html(df1_styler._repr_html_()+df2_styler._repr_html_()+df3_styler._repr_html_(), raw=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the graph from the Dining preferences data set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "G = dining_prefs.get_graph(download=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the three similarity Algorithms and print out the five links with the highest scores." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jdf, odf, sdf = compute_similarity(G)\n", + "print_similarity(jdf,odf,sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now find the complete set of two-hop neighbors and compare them instead of just using the existing one-hop edges. 
In a larger graph, this will run considerably faster since the default " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this cugraph algorithm pulls a set containing every pair of vertices\n", + "# that are within 2-hops of each other\n", + "two_hops_pairs = G.get_two_hop_neighbors()\n", + "\n", + "jdf_hops, odf_hops, sdf_hops = compute_similarity(G,pairs=two_hops_pairs)\n", + "print_similarity(jdf_hops,odf_hops,sdf_hops)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### It's that easy with cuGraph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "Copyright (c) 2023, NVIDIA CORPORATION.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cugraph_0802", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From b199bf01a2dbb3a8bc89198c6d35fd5a0444e213 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Tue, 26 Sep 2023 05:33:13 -0700 Subject: [PATCH 18/22] [REVIEW] Add Pure DGL Dataloading benchmark (#3660) This PR adds the DGL data loading benchmark: Arguments supported: - dataset_path: path to the dataset - replication_factors: replication factors for number of edges - fanouts: fanouts - batch_sizes: batch sizes ```bash python3 dgl_dataloading.py --dataset_path "/datasets/abarghi/ogbn_papers100M" \ --replication_factors "1,2,4" \ --fanouts "25_25,10_10_10,5_10_20" \ --batch_sizes "512,1024" ``` This produces the following results on a V100: | Fanout | Batch Size | Data Loading Time per Epoch | Data Loading Time per Batch | Number of Edges | Number of Batches | Replication Factor | |--------|------------|-----------------------------|-----------------------------|-----------------|-------------------|--------------------| | [25, 25] | 512 | 9.48 | 0.0031 | 1615685872 | 3022 | 1 | | [25, 25] | 1024 | 6.39 | 0.0042 | 1615685872 | 1511 | 1 | | [10, 10, 10] | 512 | 15.91 | 0.0053 | 1615685872 | 3022 | 1 | | [10, 10, 10] | 1024 | 11.64 | 0.0077 | 1615685872 | 1511 | 1 | | [5, 10, 20] | 512 | 17.73 | 0.0059 | 1615685872 | 3022 | 1 | | [5, 10, 20] | 1024 | 13.52 | 0.0089 | 1615685872 | 1511 | 1 | | [25, 25] | 512 | 19.44 | 0.0032 | 3231371744 | 6043 | 2 | | [25, 25] | 1024 | 12.98 | 0.0043 | 3231371744 | 3022 | 2 | | [10, 10, 10] | 512 | 32.88 | 0.0054 | 3231371744 | 6043 | 2 | | [10, 10, 10] | 1024 | 24.35 | 0.0081 | 3231371744 | 
3022 | 2 | | [5, 10, 20] | 512 | 38.35 | 0.0063 | 3231371744 | 6043 | 2 | | [5, 10, 20] | 1024 | 28.93 | 0.0096 | 3231371744 | 3022 | 2 | | [25, 25] | 512 | 37.31 | 0.0031 | 6462743488 | 12085 | 4 | | [25, 25] | 1024 | 25.15 | 0.0042 | 6462743488 | 6043 | 4 | | [10, 10, 10] | 512 | 64.29 | 0.0053 | 6462743488 | 12085 | 4 | | [10, 10, 10] | 1024 | 47.13 | 0.0078 | 6462743488 | 6043 | 4 | | [5, 10, 20] | 512 | 72.90 | 0.0060 | 6462743488 | 12085 | 4 | | [5, 10, 20] | 1024 | 56.70 | 0.0094 | 6462743488 | 6043 | 4 | | [25, 25] | 512 | 80.99 | 0.0034 | 12925486976 | 24169 | 8 | | [25, 25] | 1024 | 50.89 | 0.0042 | 12925486976 | 12085 | 8 | | [10, 10, 10] | 512 | 129.49 | 0.0054 | 12925486976 | 24169 | 8 | | [10, 10, 10] | 1024 | 93.66 | 0.0078 | 12925486976 | 12085 | 8 | | [5, 10, 20] | 512 | 143.45 | 0.0059 | 12925486976 | 24169 | 8 | | [5, 10, 20] | 1024 | 110.22 | 0.0091 | 12925486976 | 12085 | 8 | Authors: - Vibhu Jawa (https://github.com/VibhuJawa) - Alex Barghi (https://github.com/alexbarghi-nv) - Brad Rees (https://github.com/BradReesWork) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) - Tingyu Wang (https://github.com/tingyu66) URL: https://github.com/rapidsai/cugraph/pull/3660 --- .../dgl_benchmark.py | 291 ++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 benchmarks/cugraph-dgl/python-script/dgl_dataloading_benchmark/dgl_benchmark.py diff --git a/benchmarks/cugraph-dgl/python-script/dgl_dataloading_benchmark/dgl_benchmark.py b/benchmarks/cugraph-dgl/python-script/dgl_dataloading_benchmark/dgl_benchmark.py new file mode 100644 index 00000000000..0a52703c546 --- /dev/null +++ b/benchmarks/cugraph-dgl/python-script/dgl_dataloading_benchmark/dgl_benchmark.py @@ -0,0 +1,291 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import dgl +import torch +import pandas as pd +import os +import time +import json +import random +import numpy as np +from argparse import ArgumentParser + + +def load_edges_from_disk(parquet_path, replication_factor, input_meta): + """ + Load the edges from disk into a graph data dictionary. + Args: + parquet_path: Path to the parquet directory. + replication_factor: Number of times to replicate the edges. + input_meta: Input meta data. + Returns: + dict: Dictionary of edge types to a tuple of (src, dst) + """ + graph_data = {} + for edge_type in input_meta["num_edges"].keys(): + print( + f"Loading edge index for edge type {edge_type}" + f"for replication factor = {replication_factor}" + ) + can_edge_type = tuple(edge_type.split("__")) + # TODO: Rename `edge_index` to a better name + ei = pd.read_parquet( + os.path.join(parquet_path, edge_type, "edge_index.parquet") + ) + ei = { + "src": torch.from_numpy(ei.src.values), + "dst": torch.from_numpy(ei.dst.values), + } + if replication_factor > 1: + src_ls = [ei["src"]] + dst_ls = [ei["dst"]] + for r in range(1, replication_factor): + new_src = ei["src"] + ( + r * input_meta["num_nodes"][can_edge_type[0]] + ) + src_ls.append(new_src) + new_dst = ei["dst"] + ( + r * input_meta["num_nodes"][can_edge_type[2]] + ) + dst_ls.append(new_dst) + + ei["src"] = torch.cat(src_ls).contiguous() + ei["dst"] = torch.cat(dst_ls).contiguous() + graph_data[can_edge_type] = ei["src"], ei["dst"] + print("Graph Data compiled") + return graph_data + + +def load_node_labels(dataset_path, replication_factor, input_meta): + 
num_nodes_dict = { + node_type: t * replication_factor + for node_type, t in input_meta["num_nodes"].items() + } + node_data = {} + for node_type in input_meta["num_nodes"].keys(): + node_data[node_type] = {} + label_path = os.path.join( + dataset_path, "parquet", node_type, "node_label.parquet" + ) + if os.path.exists(label_path): + node_label = pd.read_parquet(label_path) + if replication_factor > 1: + base_num_nodes = input_meta["num_nodes"][node_type] + dfr = pd.DataFrame( + { + "node": pd.concat( + [ + node_label.node + (r * base_num_nodes) + for r in range(1, replication_factor) + ] + ), + "label": pd.concat( + [ + node_label.label + for r in range(1, replication_factor) + ] + ), + } + ) + node_label = pd.concat([node_label, dfr]).reset_index( + drop=True + ) + + node_label_tensor = torch.full( + (num_nodes_dict[node_type],), -1, dtype=torch.float32 + ) + node_label_tensor[ + torch.as_tensor(node_label.node.values) + ] = torch.as_tensor(node_label.label.values) + + del node_label + node_data[node_type]["train_idx"] = ( + (node_label_tensor > -1).contiguous().nonzero().view(-1) + ) + node_data[node_type]["y"] = node_label_tensor.contiguous() + else: + node_data[node_type]["num_nodes"] = num_nodes_dict[node_type] + return node_data + + +def create_dgl_graph_from_disk(dataset_path, replication_factor=1): + """ + Create a DGL graph from a dataset on disk. + Args: + dataset_path: Path to the dataset on disk. + replication_factor: Number of times to replicate the edges. + Returns: + DGLGraph: DGLGraph with the loaded dataset. 
+ """ + with open(os.path.join(dataset_path, "meta.json"), "r") as f: + input_meta = json.load(f) + + parquet_path = os.path.join(dataset_path, "parquet") + graph_data = load_edges_from_disk( + parquet_path, replication_factor, input_meta + ) + node_data = load_node_labels(dataset_path, replication_factor, input_meta) + g = dgl.heterograph(graph_data) + + return g, node_data + + +def create_dataloader(g, train_idx, batch_size, fanouts, use_uva): + """ + Create a DGL dataloader from a DGL graph. + Args: + g: DGLGraph to create the dataloader from. + train_idx: Tensor containing the training indices. + batch_size: Batch size to use for the dataloader. + fanouts: List of fanouts to use for the dataloader. + use_uva: Whether to use unified virtual address space. + Returns: + DGLGraph: DGLGraph with the loaded dataset. + """ + + print("Creating dataloader", flush=True) + st = time.time() + if use_uva: + train_idx = {k: v.to("cuda") for k, v in train_idx.items()} + sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts=fanouts) + dataloader = dgl.dataloading.DataLoader( + g, + train_idx, + sampler, + num_workers=0, + batch_size=batch_size, + use_uva=use_uva, + shuffle=False, + drop_last=False, + ) + et = time.time() + print(f"Time to create dataloader = {et - st:.2f} seconds") + return dataloader + + +def dataloading_benchmark(g, train_idx, fanouts, batch_sizes, use_uva): + """ + Run the dataloading benchmark. + Args: + g: DGLGraph + train_idx: Tensor containing the training indices. + fanouts: List of fanouts to use for the dataloader. + batch_sizes: List of batch sizes to use for the dataloader. + use_uva: Whether to use unified virtual address space. 
+ """ + time_ls = [] + for fanout in fanouts: + for batch_size in batch_sizes: + dataloader = create_dataloader( + g, + train_idx, + batch_size=batch_size, + fanouts=fanout, + use_uva=use_uva, + ) + dataloading_st = time.time() + for input_nodes, output_nodes, blocks in dataloader: + pass + dataloading_et = time.time() + dataloading_time = dataloading_et - dataloading_st + time_d = { + "fanout": fanout, + "batch_size": batch_size, + "dataloading_time_per_epoch": dataloading_time, + "dataloading_time_per_batch": dataloading_time / len(dataloader), + "num_edges": g.num_edges(), + "num_batches": len(dataloader), + } + time_ls.append(time_d) + + print("Dataloading completed") + print(f"Fanout = {fanout}, batch_size = {batch_size}") + print( + f"Time taken {dataloading_time:.2f} ", + f"seconds for num batches {len(dataloader)}", + flush=True, + ) + print("==============================================") + return time_ls + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument( + "--dataset_path", type=str, default="/datasets/abarghi/ogbn_papers100M" + ) + parser.add_argument("--replication_factors", type=str, default="1,2,4,8") + parser.add_argument( + "--fanouts", type=str, default="25_25,10_10_10,5_10_20" + ) + parser.add_argument("--batch_sizes", type=str, default="512,1024") + parser.add_argument("--do_not_use_uva", action="store_true") + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + if args.do_not_use_uva: + use_uva = False + else: + use_uva = True + set_seed(args.seed) + replication_factors = [int(x) for x in args.replication_factors.split(",")] + fanouts = [[int(y) for y in x.split("_")] for x in args.fanouts.split(",")] + batch_sizes = [int(x) for x in args.batch_sizes.split(",")] + + print("Running dgl dataloading benchmark with the following parameters:") + print(f"Dataset 
path = {args.dataset_path}") + print(f"Replication factors = {replication_factors}") + print(f"Fanouts = {fanouts}") + print(f"Batch sizes = {batch_sizes}") + print(f"Use UVA = {use_uva}") + print("==============================================") + + time_ls = [] + for replication_factor in replication_factors: + st = time.time() + g, node_data = create_dgl_graph_from_disk( + dataset_path=args.dataset_path, + replication_factor=replication_factor, + ) + et = time.time() + print(f"Replication factor = {replication_factor}") + print( + f"G has {g.num_edges()} edges and took", + f" {et - st:.2f} seconds to load" + ) + train_idx = {"paper": node_data["paper"]["train_idx"]} + r_time_ls = dataloading_benchmark( + g, train_idx, fanouts, batch_sizes, use_uva=use_uva + ) + print( + "Benchmark completed for replication factor = ", replication_factor + ) + print("==============================================") + # Add replication factor to the time list + [ + x.update({"replication_factor": replication_factor}) + for x in r_time_ls + ] + time_ls.extend(r_time_ls) + + df = pd.DataFrame(time_ls) + df.to_csv("dgl_dataloading_benchmark.csv", index=False) + print("Benchmark completed for all replication factors") + print("==============================================") From 8b02e241617df8bac33d0fd69b03046d2ddaf2d9 Mon Sep 17 00:00:00 2001 From: Naim <110031745+naimnv@users.noreply.github.com> Date: Tue, 26 Sep 2023 15:43:02 +0200 Subject: [PATCH 19/22] Enable temporarily disabled MG tests (#3837) Enable TEMPORARILY disable single-GPU "MG" tests And Skip deleting copied Dataframe while creating distributed graph from cudf edge-lists. Ideally we would like to merger this PR once the [issue 3790](https://github.com/rapidsai/cugraph/issues/3790) is closed, but me might need to merger it if the issue is not resolved before the next release. 
Authors: - Naim (https://github.com/naimnv) Approvers: - Rick Ratzel (https://github.com/rlratzel) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cugraph/pull/3837 --- ci/test_python.sh | 6 +----- ci/test_wheel.sh | 4 +--- .../graph_implementation/simpleDistributedGraph.py | 5 ++++- python/cugraph/cugraph/tests/traversal/test_bfs_mg.py | 5 ++++- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index 14886909fc9..7b0077991ae 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -63,10 +63,6 @@ pytest \ tests popd -# FIXME: TEMPORARILY disable single-GPU "MG" testing until -# https://github.com/rapidsai/cugraph/issues/3790 is closed -# When closed, replace -k "not _mg" with -# -k "not test_property_graph_mg" \ rapids-logger "pytest cugraph" pushd python/cugraph/cugraph export DASK_WORKER_DEVICES="0" @@ -79,7 +75,7 @@ pytest \ --cov=cugraph \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-coverage.xml" \ --cov-report=term \ - -k "not _mg" \ + -k "not test_property_graph_mg" \ tests popd diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index b62635d08b4..146186ae2e7 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -18,7 +18,5 @@ arch=$(uname -m) if [[ "${arch}" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then python ./ci/wheel_smoke_test_${package_name}.py else - # FIXME: TEMPORARILY disable single-GPU "MG" testing until - # https://github.com/rapidsai/cugraph/issues/3790 is closed - RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets python -m pytest -k "not _mg" ./python/${package_name}/${package_name}/tests + RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets python -m pytest ./python/${package_name}/${package_name}/tests fi diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index 0586d0d853c..01885c2d1c3 100644 --- 
a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -14,6 +14,7 @@ import gc from typing import Union import warnings +import random import cudf import cupy as cp @@ -182,7 +183,9 @@ def __from_edgelist( # Repartition to 2 partitions per GPU for memory efficient process input_ddf = input_ddf.repartition(npartitions=len(workers) * 2) # FIXME: Make a copy of the input ddf before implicitly altering it. - input_ddf = input_ddf.map_partitions(lambda df: df.copy()) + input_ddf = input_ddf.map_partitions( + lambda df: df.copy(), token="custom-" + str(random.random()) + ) # The dataframe will be symmetrized iff the graph is undirected # otherwise, the inital dataframe will be returned if edge_attr is not None: diff --git a/python/cugraph/cugraph/tests/traversal/test_bfs_mg.py b/python/cugraph/cugraph/tests/traversal/test_bfs_mg.py index 8ffbecea4fc..5eafc231141 100644 --- a/python/cugraph/cugraph/tests/traversal/test_bfs_mg.py +++ b/python/cugraph/cugraph/tests/traversal/test_bfs_mg.py @@ -12,6 +12,7 @@ # limitations under the License. import gc +import random import pytest @@ -61,7 +62,9 @@ def modify_dataset(df): return cudf.concat([df, temp_df]) meta = ddf._meta - ddf = ddf.map_partitions(modify_dataset, meta=meta) + ddf = ddf.map_partitions( + modify_dataset, meta=meta, token="custom-" + str(random.random()) + ) df = cudf.read_csv( input_data_path, From a9f4297223593f8df211599277519e206c597630 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Tue, 26 Sep 2023 08:57:51 -0500 Subject: [PATCH 20/22] Enable weights for MG similarity algorithms (#3879) This is a follow up PR to #3828 which enabled weighted for the python SG similarity algorithms. 
This PR also updates the tests, docstrings and remove experimental calls Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3879 --- .../cugraph/dask/link_prediction/jaccard.py | 10 +--- .../cugraph/dask/link_prediction/overlap.py | 10 +--- .../cugraph/dask/link_prediction/sorensen.py | 10 +--- .../tests/link_prediction/test_jaccard_mg.py | 59 ++++++------------- .../tests/link_prediction/test_overlap_mg.py | 59 ++++++------------- .../tests/link_prediction/test_sorensen_mg.py | 59 ++++++------------- 6 files changed, 60 insertions(+), 147 deletions(-) diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index 218e6206fc3..5362c7a9e1e 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -118,7 +118,9 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): adjacent vertices in the graph. use_weight : bool, optional (default=False) - Currently not supported + Flag to indicate whether to compute weighted jaccard (if use_weight==True) + or un-weighted jaccard (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. 
Returns ------- @@ -144,12 +146,6 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): vertex_pair_col_name = vertex_pair.columns - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if input_graph.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index 5540be28fd1..4bda05e3c95 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -96,7 +96,9 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): adjacent vertices in the graph. use_weight : bool, optional (default=False) - Currently not supported + Flag to indicate whether to compute weighted overlap (if use_weight==True) + or un-weighted overlap (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. Returns ------- @@ -122,12 +124,6 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): vertex_pair_col_name = vertex_pair.columns - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if input_graph.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index 24295ac330c..163b0d0dc16 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -92,7 +92,9 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): adjacent vertices in the graph. 
use_weight : bool, optional (default=False) - Currently not supported + Flag to indicate whether to compute weighted sorensen (if use_weight==True) + or un-weighted sorensen (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. Returns ------- @@ -118,12 +120,6 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): vertex_pair_col_name = vertex_pair.columns - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if input_graph.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py index b56a6baae2b..ee739c9f236 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py @@ -34,6 +34,7 @@ def setup_function(): IS_DIRECTED = [False] HAS_VERTEX_PAIR = [True, False] +IS_WEIGHTED = [True, False] # ============================================================================= @@ -48,6 +49,7 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (IS_WEIGHTED, "is_weighted"), ) @@ -57,7 +59,9 @@ def input_combo(request): Simply return the current combination of params as a dictionary for use in tests or other parameterized fixtures. 
""" - parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param)) + parameters = dict( + zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + ) return parameters @@ -72,7 +76,10 @@ def input_expected_output(input_combo): input_data_path = input_combo["graph_file"] directed = input_combo["directed"] has_vertex_pair = input_combo["has_vertex_pair"] - G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed) + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) if has_vertex_pair: # Sample random vertices from the graph and compute the two_hop_neighbors # with those seeds @@ -84,7 +91,9 @@ def input_expected_output(input_combo): vertex_pair = None input_combo["vertex_pair"] = vertex_pair - sg_cugraph_jaccard = cugraph.experimental.jaccard(G, input_combo["vertex_pair"]) + sg_cugraph_jaccard = cugraph.jaccard( + G, input_combo["vertex_pair"], use_weight=is_weighted + ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. Other tests using the input_combo fixture will look for # them, and if not present they will have to re-run the same cuGraph call. 
@@ -104,6 +113,7 @@ def input_expected_output(input_combo): ddf, source="src", destination="dst", + edge_attr="value" if is_weighted else None, renumber=True, store_transposed=True, ) @@ -122,8 +132,11 @@ def input_expected_output(input_combo): def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] + use_weight = input_expected_output["is_weighted"] - result_jaccard = benchmark(dcg.jaccard, dg, input_expected_output["vertex_pair"]) + result_jaccard = benchmark( + dcg.jaccard, dg, input_expected_output["vertex_pair"], use_weight=use_weight + ) result_jaccard = ( result_jaccard.compute() @@ -151,41 +164,3 @@ def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output): assert len(jaccard_coeff_diffs1) == 0 assert len(jaccard_coeff_diffs2) == 0 - - -@pytest.mark.mg -def test_dask_mg_weighted_jaccard(dask_client): - input_data_path = datasets[0] - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - chunksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - renumber=True, - store_transposed=True, - ) - with pytest.raises(ValueError): - dcg.jaccard(dg) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - store_transposed=True, - ) - - use_weight = True - with pytest.raises(ValueError): - dcg.jaccard(dg, use_weight=use_weight) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py index ce4bf619f47..87407d7b59c 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py @@ -34,6 +34,7 @@ def setup_function(): IS_DIRECTED = 
[False] HAS_VERTEX_PAIR = [True, False] +IS_WEIGHTED = [True, False] # ============================================================================= @@ -48,6 +49,7 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (IS_WEIGHTED, "is_weighted"), ) @@ -57,7 +59,9 @@ def input_combo(request): Simply return the current combination of params as a dictionary for use in tests or other parameterized fixtures. """ - parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param)) + parameters = dict( + zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + ) return parameters @@ -72,7 +76,10 @@ def input_expected_output(input_combo): input_data_path = input_combo["graph_file"] directed = input_combo["directed"] has_vertex_pair = input_combo["has_vertex_pair"] - G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed) + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) if has_vertex_pair: # Sample random vertices from the graph and compute the two_hop_neighbors # with those seeds @@ -84,7 +91,9 @@ def input_expected_output(input_combo): vertex_pair = None input_combo["vertex_pair"] = vertex_pair - sg_cugraph_overlap = cugraph.experimental.overlap(G, input_combo["vertex_pair"]) + sg_cugraph_overlap = cugraph.overlap( + G, input_combo["vertex_pair"], use_weight=is_weighted + ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. Other tests using the input_combo fixture will look for # them, and if not present they will have to re-run the same cuGraph call. 
@@ -104,6 +113,7 @@ def input_expected_output(input_combo): ddf, source="src", destination="dst", + edge_attr="value" if is_weighted else None, renumber=True, store_transposed=True, ) @@ -125,8 +135,11 @@ def input_expected_output(input_combo): def test_dask_mg_overlap(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] + use_weight = input_expected_output["is_weighted"] - result_overlap = benchmark(dcg.overlap, dg, input_expected_output["vertex_pair"]) + result_overlap = benchmark( + dcg.overlap, dg, input_expected_output["vertex_pair"], use_weight=use_weight + ) result_overlap = ( result_overlap.compute() @@ -154,41 +167,3 @@ def test_dask_mg_overlap(dask_client, benchmark, input_expected_output): assert len(overlap_coeff_diffs1) == 0 assert len(overlap_coeff_diffs2) == 0 - - -@pytest.mark.mg -def test_dask_mg_weighted_overlap(): - input_data_path = datasets[0] - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - chunksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - renumber=True, - store_transposed=True, - ) - with pytest.raises(ValueError): - dcg.overlap(dg) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - store_transposed=True, - ) - - use_weight = True - with pytest.raises(ValueError): - dcg.overlap(dg, use_weight=use_weight) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py index af6b60771a0..66832d08427 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py @@ -35,6 +35,7 @@ def setup_function(): IS_DIRECTED = 
[False] HAS_VERTEX_PAIR = [True, False] +IS_WEIGHTED = [True, False] # ============================================================================= @@ -49,6 +50,7 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (IS_WEIGHTED, "is_weighted"), ) @@ -58,7 +60,9 @@ def input_combo(request): Simply return the current combination of params as a dictionary for use in tests or other parameterized fixtures. """ - parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param)) + parameters = dict( + zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + ) return parameters @@ -73,7 +77,10 @@ def input_expected_output(input_combo): input_data_path = input_combo["graph_file"] directed = input_combo["directed"] has_vertex_pair = input_combo["has_vertex_pair"] - G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed) + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) if has_vertex_pair: # Sample random vertices from the graph and compute the two_hop_neighbors # with those seeds @@ -85,7 +92,9 @@ def input_expected_output(input_combo): vertex_pair = None input_combo["vertex_pair"] = vertex_pair - sg_cugraph_sorensen = cugraph.experimental.sorensen(G, input_combo["vertex_pair"]) + sg_cugraph_sorensen = cugraph.sorensen( + G, input_combo["vertex_pair"], use_weight=is_weighted + ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. Other tests using the input_combo fixture will look for # them, and if not present they will have to re-run the same cuGraph call. 
@@ -105,6 +114,7 @@ def input_expected_output(input_combo): ddf, source="src", destination="dst", + edge_attr="value" if is_weighted else None, renumber=True, store_transposed=True, ) @@ -124,8 +134,11 @@ def input_expected_output(input_combo): def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] + use_weight = input_expected_output["is_weighted"] - result_sorensen = benchmark(dcg.sorensen, dg, input_expected_output["vertex_pair"]) + result_sorensen = benchmark( + dcg.sorensen, dg, input_expected_output["vertex_pair"], use_weight=use_weight + ) result_sorensen = ( result_sorensen.compute() @@ -153,41 +166,3 @@ def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output): assert len(sorensen_coeff_diffs1) == 0 assert len(sorensen_coeff_diffs2) == 0 - - -@pytest.mark.mg -def test_dask_mg_weighted_sorensen(dask_client): - input_data_path = datasets[0] - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - chunksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - renumber=True, - store_transposed=True, - ) - with pytest.raises(ValueError): - dcg.sorensen(dg) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - store_transposed=True, - ) - - use_weight = True - with pytest.raises(ValueError): - dcg.sorensen(dg, use_weight=use_weight) From 5c34d3dd340c76678d8f2667057c6b0ce2f1f480 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Tue, 26 Sep 2023 11:57:30 -0400 Subject: [PATCH 21/22] Update Allocator Selection in cuGraph-DGL Example (#3877) Closes #3847 Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Vibhu Jawa 
(https://github.com/VibhuJawa) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3877 --- .../cugraph-dgl/examples/graphsage/node-classification.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/cugraph-dgl/examples/graphsage/node-classification.py b/python/cugraph-dgl/examples/graphsage/node-classification.py index 24df73ada75..320890b0312 100644 --- a/python/cugraph-dgl/examples/graphsage/node-classification.py +++ b/python/cugraph-dgl/examples/graphsage/node-classification.py @@ -39,14 +39,16 @@ def set_allocators(): + import rmm import cudf import cupy - import rmm + from rmm.allocators.torch import rmm_torch_allocator + from rmm.allocators.cupy import rmm_cupy_allocator mr = rmm.mr.CudaAsyncMemoryResource() rmm.mr.set_current_device_resource(mr) - torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator) - cupy.cuda.set_allocator(rmm.allocators.cupy.rmm_cupy_allocator) + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + cupy.cuda.set_allocator(rmm_cupy_allocator) cudf.set_option("spill", True) From 5d2f5486899bdd2d71c00b12fb26afbdd60100d1 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:59:55 -0500 Subject: [PATCH 22/22] Updates to build and test `nx-cugraph` wheel as part of CI and nightly workflows (#3852) closes rapidsai/graph_dl#302 * Updates GHA yaml files to build and test a `nx-cugraph` wheel * Adds CI scripts for building and testing the `nx-cugraph` wheel * Adds a smoketest script for `nx-cugraph` * Relevant code cleanup: removes unnecessary dataset download from cugraph wheel testing Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Ray Douglass (https://github.com/raydouglass) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cugraph/pull/3852 --- .github/workflows/build.yaml | 20 ++++++++++++++++ 
.github/workflows/pr.yaml | 16 +++++++++++++ .github/workflows/test.yaml | 9 ++++++++ ci/build_wheel.sh | 12 ++++++---- ci/build_wheel_nx-cugraph.sh | 6 +++++ ci/test_wheel.sh | 9 +++++--- ci/test_wheel_cugraph.sh | 8 ------- ci/test_wheel_nx-cugraph.sh | 6 +++++ ci/wheel_smoke_test_nx-cugraph.py | 38 +++++++++++++++++++++++++++++++ 9 files changed, 109 insertions(+), 15 deletions(-) create mode 100755 ci/build_wheel_nx-cugraph.sh create mode 100755 ci/test_wheel_nx-cugraph.sh create mode 100644 ci/wheel_smoke_test_nx-cugraph.py diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 02b357c7c88..c01a6fcb94a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -112,3 +112,23 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: cugraph + wheel-build-nx-cugraph: + needs: wheel-publish-pylibcugraph + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel_nx-cugraph.sh + wheel-publish-nx-cugraph: + needs: wheel-build-nx-cugraph + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: nx-cugraph diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d2d24d90fbe..d49ae7f8d3d 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -23,6 +23,8 @@ jobs: - wheel-tests-pylibcugraph - wheel-build-cugraph - wheel-tests-cugraph + - wheel-build-nx-cugraph + - wheel-tests-nx-cugraph secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10 checks: @@ -109,3 +111,17 @@ jobs: with: build_type: pull-request script: 
ci/test_wheel_cugraph.sh + wheel-build-nx-cugraph: + needs: wheel-tests-pylibcugraph + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + with: + build_type: pull-request + script: ci/build_wheel_nx-cugraph.sh + wheel-tests-nx-cugraph: + needs: wheel-build-nx-cugraph + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + with: + build_type: pull-request + script: ci/test_wheel_nx-cugraph.sh diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 97abca71260..dc9ed60b29e 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -48,3 +48,12 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/test_wheel_cugraph.sh + wheel-tests-nx-cugraph: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + script: ci/test_wheel_nx-cugraph.sh diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 3798d561126..821aa25c1b9 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -49,7 +49,11 @@ cd "${package_dir}" python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check -mkdir -p final_dist -python -m auditwheel repair -w final_dist dist/* - -RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist +# pure-python packages should not have auditwheel run on them. 
+if [[ ${package_name} == "nx-cugraph" ]]; then + RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 dist +else + mkdir -p final_dist + python -m auditwheel repair -w final_dist dist/* + RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist +fi diff --git a/ci/build_wheel_nx-cugraph.sh b/ci/build_wheel_nx-cugraph.sh new file mode 100755 index 00000000000..4481de1283d --- /dev/null +++ b/ci/build_wheel_nx-cugraph.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +./ci/build_wheel.sh nx-cugraph python/nx-cugraph diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 146186ae2e7..3ac3549f143 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -6,17 +6,20 @@ set -eoxu pipefail package_name=$1 package_dir=$2 +python_package_name=$(echo ${package_name}|sed 's/-/_/g') + mkdir -p ./dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -# echo to expand wildcard before adding `[extra]` requires for pip +# use 'ls' to expand wildcard before adding `[extra]` requires for pip RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist -python -m pip install $(echo ./dist/${package_name}*.whl)[test] +# pip creates wheels using python package names +python -m pip install $(ls ./dist/${python_package_name}*.whl)[test] # Run smoke tests for aarch64 pull requests arch=$(uname -m) if [[ "${arch}" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then python ./ci/wheel_smoke_test_${package_name}.py else - RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets python -m pytest ./python/${package_name}/${package_name}/tests + RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets python -m pytest ./python/${package_name}/${python_package_name}/tests fi diff --git a/ci/test_wheel_cugraph.sh b/ci/test_wheel_cugraph.sh index 4d511ac2a0f..f9e2aa6d8da 100755 --- a/ci/test_wheel_cugraph.sh +++ 
b/ci/test_wheel_cugraph.sh @@ -11,12 +11,4 @@ python -m pip install --no-deps ./local-pylibcugraph-dep/pylibcugraph*.whl # Always install latest dask for testing python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main -# Only download test data for x86 -arch=$(uname -m) -if [[ "${arch}" == "x86_64" ]]; then - pushd ./datasets - bash ./get_test_data.sh - popd -fi - ./ci/test_wheel.sh cugraph python/cugraph diff --git a/ci/test_wheel_nx-cugraph.sh b/ci/test_wheel_nx-cugraph.sh new file mode 100755 index 00000000000..53d40960fc3 --- /dev/null +++ b/ci/test_wheel_nx-cugraph.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -eoxu pipefail + +./ci/test_wheel.sh nx-cugraph python/nx-cugraph diff --git a/ci/wheel_smoke_test_nx-cugraph.py b/ci/wheel_smoke_test_nx-cugraph.py new file mode 100644 index 00000000000..10d26e3aac7 --- /dev/null +++ b/ci/wheel_smoke_test_nx-cugraph.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math + +import networkx as nx +import nx_cugraph as nxcg + + +if __name__ == "__main__": + G = nx.Graph() + G.add_edges_from([(0, 1), (1, 2), (2, 3)]) + + nx_result = nx.betweenness_centrality(G) + # nx_cugraph is intended to be called via the NetworkX dispatcher, like + # this: + # nxcu_result = nx.betweenness_centrality(G, backend="cugraph") + # + # but here it is being called directly since the NetworkX version that + # supports the "backend" kwarg may not be available in the testing env. + nxcu_result = nxcg.betweenness_centrality(G) + + nx_nodes, nxcu_nodes = nx_result.keys(), nxcu_result.keys() + assert nxcu_nodes == nx_nodes + for node_id in nx_nodes: + nx_bc, nxcu_bc = nx_result[node_id], nxcu_result[node_id] + assert math.isclose(nx_bc, nxcu_bc, rel_tol=1e-6), \ + f"bc for {node_id=} exceeds tolerance: {nx_bc=}, {nxcu_bc=}"