diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 9ff575865e3..96e491f4cb4 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -33,6 +33,3 @@ ENV SCCACHE_REGION="us-east-2"
 ENV SCCACHE_BUCKET="rapids-sccache-devs"
 ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs"
 ENV HISTFILE="/home/coder/.cache/._bash_history"
-
-# cugraph_pyg's setup.py needs this defined when building in a conda env
-ENV CUDA_HOME="${CUDA_HOME:-/home/coder/.conda/envs/$DEFAULT_CONDA_ENV}"
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 7558b4c0d7b..739996a264c 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -113,22 +113,3 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: cugraph
-  wheel-build-cugraph-equivariant:
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
-    with:
-      build_type: ${{ inputs.build_type || 'branch' }}
-      branch: ${{ inputs.branch }}
-      sha: ${{ inputs.sha }}
-      date: ${{ inputs.date }}
-      script: ci/build_wheel_cugraph-equivariant.sh
-  wheel-publish-cugraph-equivariant:
-    needs: wheel-build-cugraph-equivariant
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
-    with:
-      build_type: ${{ inputs.build_type || 'branch' }}
-      branch: ${{ inputs.branch }}
-      sha: ${{ inputs.sha }}
-      date: ${{ inputs.date }}
-      package-name: cugraph-equivariant
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 2fdc9e5198d..bb2a5ebc6ce 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -25,8 +25,6 @@ jobs:
       - wheel-tests-pylibcugraph
       - wheel-build-cugraph
       - wheel-tests-cugraph
-      - wheel-build-cugraph-equivariant
-      - wheel-tests-cugraph-equivariant
       - devcontainer
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.02
@@ -161,21 +159,6 @@ jobs:
     with:
       build_type: pull-request
       script: ci/test_wheel_cugraph.sh
-  wheel-build-cugraph-equivariant:
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
-    with:
-      build_type: pull-request
-      script: ci/build_wheel_cugraph-equivariant.sh
-  wheel-tests-cugraph-equivariant:
-    needs: [wheel-build-cugraph-equivariant, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
-    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
-    with:
-      build_type: pull-request
-      script: ci/test_wheel_cugraph-equivariant.sh
-      matrix_filter: map(select(.ARCH == "amd64"))
   devcontainer:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index b0a05ce4eb7..4aa698c987f 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -58,13 +58,3 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       script: ci/test_wheel_cugraph.sh
-  wheel-tests-cugraph-equivariant:
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
-    with:
-      build_type: nightly
-      branch: ${{ inputs.branch }}
-      date: ${{ inputs.date }}
-      sha: ${{ inputs.sha }}
-      script: ci/test_wheel_cugraph-equivariant.sh
-      matrix_filter: map(select(.ARCH == "amd64"))
diff --git a/.gitignore b/.gitignore
index 2fea1022910..9480c2618bf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -78,9 +78,6 @@ datasets/*
 !datasets/karate-disjoint.csv
 !datasets/netscience.csv
 
-# nx-cugraph side effects
-python/nx-cugraph/objects.inv
-
 .pydevproject
 
 # Jupyter Notebooks
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b5fbcf9ad42..4bb037b5fda 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,7 +19,6 @@ repos:
         language_version: python3
         args: [--target-version=py310]
         files: ^(python/.*|benchmarks/.*)$
-        exclude: ^python/nx-cugraph/
   - repo: https://github.com/PyCQA/flake8
     rev: 7.1.1
     hooks:
@@ -59,23 +58,3 @@ repos:
     hooks:
         - id: rapids-dependency-file-generator
           args: ["--clean"]
-  - repo: local
-    hooks:
-      - id: nx-cugraph-meta-data-update
-        name: nx-cugraph meta-data updater
-        entry: bash -c "PYTHONPATH=./python/nx-cugraph python ./python/nx-cugraph/_nx_cugraph/__init__.py"
-        files: ^python/nx-cugraph/
-        types: [python]
-        language: python
-        pass_filenames: false
-        additional_dependencies: ["networkx>=3.4"]
-  - repo: local
-    hooks:
-      - id: nx-cugraph-readme-update
-        name: nx-cugraph README updater
-        entry: bash -c "PYTHONPATH=./python/nx-cugraph python ./python/nx-cugraph/scripts/update_readme.py ./python/nx-cugraph/README.md"
-        files: ^python/nx-cugraph/
-        types_or: [python, markdown]
-        language: python
-        pass_filenames: false
-        additional_dependencies: ["networkx>=3.4"]
diff --git a/README.md b/README.md
index 8026e4feb64..857406075e0 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@
 -----
 ## News
 
-___NEW!___   _[nx-cugraph](./python/nx-cugraph/README.md)_, a NetworkX backend that provides GPU acceleration to NetworkX with zero code change.
+___NEW!___   _[nx-cugraph](https://rapids.ai/nx-cugraph/)_, a NetworkX backend that provides GPU acceleration to NetworkX with zero code change.
 ```
 > pip install nx-cugraph-cu11 --extra-index-url https://pypi.nvidia.com
 > export NETWORKX_AUTOMATIC_BACKENDS=cugraph
@@ -62,9 +62,8 @@ That's it.  NetworkX now leverages cuGraph for accelerated graph algorithms.
     - [External Data Types](./readme_pages/data_types.md)
   - [pylibcugraph](./readme_pages/pylibcugraph.md)
   - [libcugraph (C/C++/CUDA)](./readme_pages/libcugraph.md)
-  - [nx-cugraph](./python/nx-cugraph/README.md)
+  - [nx-cugraph](https://rapids.ai/nx-cugraph/)
   - [cugraph-service](./readme_pages/cugraph_service.md)
-  - [cugraph-dgl](./readme_pages/cugraph_dgl.md)
   - [cugraph-ops](./readme_pages/cugraph_ops.md)
 - API Docs
   - Python
@@ -127,7 +126,7 @@ df_page.sort_values('pagerank', ascending=False).head(10)
 * ArangoDB - a free and open-source native multi-model database system  - https://www.arangodb.com/
 * CuPy - "NumPy/SciPy-compatible Array Library for GPU-accelerated Computing with Python" -  https://cupy.dev/
 * Memgraph - In-memory Graph database - https://memgraph.com/
-* NetworkX (via [nx-cugraph](./python/nx-cugraph/README.md) backend) - an extremely popular, free and open-source package for the creation, manipulation, and study of the structure, dynamics, and functions of complex networks - https://networkx.org/
+* NetworkX (via [nx-cugraph](https://rapids.ai/nx-cugraph/) backend) - an extremely popular, free and open-source package for the creation, manipulation, and study of the structure, dynamics, and functions of complex networks - https://networkx.org/
 * PyGraphistry - free and open-source GPU graph ETL, AI, and visualization, including native RAPIDS & cuGraph support - http://github.com/graphistry/pygraphistry
 * ScanPy - a scalable toolkit for analyzing single-cell gene expression data - https://scanpy.readthedocs.io/en/stable/
 
diff --git a/benchmarks/nx-cugraph/pytest-based/README.md b/benchmarks/nx-cugraph/pytest-based/README.md
deleted file mode 100644
index 414a22171a0..00000000000
--- a/benchmarks/nx-cugraph/pytest-based/README.md
+++ /dev/null
@@ -1,49 +0,0 @@
-## `nx-cugraph` Benchmarks
-
-### Overview
-
-This directory contains a set of scripts designed to benchmark NetworkX with the `nx-cugraph` backend and deliver a report that summarizes the speed-up and runtime deltas over default NetworkX.
-
-Our current benchmarks provide the following datasets:
-
-| Dataset     | Nodes | Edges | Directed |
-| --------    | ------- | ------- | ------- |
-| netscience  | 1,461    | 5,484 | Yes |
-| email-Eu-core  | 1,005    | 25,571 | Yes |
-| amazon0302  | 262,111  | 1,234,877 | Yes |
-| cit-Patents  | 3,774,768    | 16,518,948 | Yes |
-| hollywood  | 1,139,905    | 57,515,616 | No |
-| soc-LiveJournal1  | 4,847,571    | 68,993,773 | Yes |
-
-
-
-### Scripts
-
-#### 1. `run-main-benchmarks.sh`
-This script allows users to run a small set of commonly-used algorithms across multiple datasets and backends. All results are stored inside a sub-directory (`logs/`) and output files are named based on the combination of parameters for that benchmark.
-
-NOTE:
- - If running with all algorithms and datasets using NetworkX without an accelerated backend, this script may take a few hours to finish running.
- - The `betweenness_centrality` benchmark will run with values `[10, 20, 50, 100, 500, 1000]` by default. You can specify only specific k-values to be run by editing `bc_k_values` (line 46) to be passed as a [pytest keyword object](https://docs.pytest.org/en/6.2.x/usage.html#specifying-tests-selecting-tests).
-
-**Usage:**
- - Run with `--cpu-only`:
-  ```bash
-  ./run-main-benchmarks.sh --cpu-only
-  ```
- - Run with `--gpu-only`:
-  ```bash
-  ./run-main-benchmarks.sh --gpu-only
-  ```
- - Run without any arguments (all backends):
-  ```bash
-  ./run-main-benchmarks.sh
-  ```
-
-#### 2. `create_results_summary_page.py`
-This script is designed to be run after `run-main-benchmarks.sh` in order to generate an HTML page displaying a results table comparing default NetworkX to nx-cugraph. The script also provides information about the current system, so it should be run on the machine on which benchmarks were run.
-
-**Usage:**
-  ```bash
-  python create_results_summary_page.py > report.html
-  ```
diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py
deleted file mode 100644
index 8852ed2a875..00000000000
--- a/benchmarks/nx-cugraph/pytest-based/bench_algos.py
+++ /dev/null
@@ -1,985 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import random
-
-import networkx as nx
-import pandas as pd
-import pytest
-from cugraph import datasets
-import nx_cugraph as nxcg
-
-# Attempt to import the NetworkX dispatching module, which is only needed when
-# testing with NX <3.2 in order to dynamically switch backends. NX >=3.2 allows
-# the backend to be specified directly in the API call.
-try:
-    from networkx.classes import backends  # NX <3.2
-except ImportError:
-    backends = None
-
-
-################################################################################
-# Fixtures and params
-
-# See https://pytest-benchmark.readthedocs.io/en/latest/glossary.html for how
-# these variables are used.
-rounds = 1
-iterations = 1
-warmup_rounds = 1
-
-# FIXME: Add this to cugraph.datasets.  This is done here so these benchmarks
-# can be run without requiring an updated cugraph install.  This temporarily
-# adds a dataset based on an Amazon product co-purchasing network.
-amazon0302_metadata = """
-name: amazon0302
-description:
-  Network was collected by crawling Amazon website. It is based on Customers Who Bought This Item Also Bought feature of the Amazon website. If a product i is frequently co-purchased with product j, the graph contains a directed edge from i to j. The data was collected in March 02 2003.
-author: J. Leskovec, L. Adamic and B. Adamic
-refs: J. Leskovec, L. Adamic and B. Adamic. The Dynamics of Viral Marketing. ACM Transactions on the Web (ACM TWEB), 1(1), 2007.
-delim: "\t"
-header: 3
-col_names:
-  - FromNodeId
-  - ToNodeId
-col_types:
-  - int32
-  - int32
-has_loop: false
-is_directed: true
-is_multigraph: false
-is_symmetric: false
-number_of_edges: 1234877
-number_of_nodes: 262111
-url: https://snap.stanford.edu/data/amazon0302.txt.gz
-"""
-amazon0302_metadata_file_name = datasets.default_download_dir.path / "amazon0302.yaml"
-if not amazon0302_metadata_file_name.exists():
-    amazon0302_metadata_file_name.parent.mkdir(parents=True, exist_ok=True)
-    with open(amazon0302_metadata_file_name, "w") as f:
-        f.write(amazon0302_metadata)
-
-amazon0302_dataset = datasets.Dataset(amazon0302_metadata_file_name)
-amazon0302_dataset.metadata["file_type"] = ".gz"
-
-dataset_param_values = [
-    # name: karate, nodes: 34, edges: 156
-    pytest.param(datasets.karate, marks=[pytest.mark.small, pytest.mark.undirected]),
-    # name: netscience, nodes: 1461, edges: 5484
-    pytest.param(datasets.netscience, marks=[pytest.mark.small, pytest.mark.directed]),
-    # name: email-Eu-core, nodes: 1005, edges: 25571
-    pytest.param(
-        datasets.email_Eu_core, marks=[pytest.mark.small, pytest.mark.directed]
-    ),
-    # name: amazon0302, nodes: 262111, edges: 1234877
-    pytest.param(amazon0302_dataset, marks=[pytest.mark.medium, pytest.mark.directed]),
-    # name: cit-Patents, nodes: 3774768, edges: 16518948
-    pytest.param(
-        datasets.cit_patents, marks=[pytest.mark.medium, pytest.mark.directed]
-    ),
-    # name: hollywood, nodes: 1139905, edges: 57515616
-    pytest.param(
-        datasets.hollywood, marks=[pytest.mark.medium, pytest.mark.undirected]
-    ),
-    # name: soc-LiveJournal1, nodes: 4847571, edges: 68993773
-    pytest.param(
-        datasets.soc_livejournal, marks=[pytest.mark.medium, pytest.mark.directed]
-    ),
-    # name: europe_osm, nodes: 50912018, edges: 54054660
-    pytest.param(
-        datasets.europe_osm, marks=[pytest.mark.large, pytest.mark.undirected]
-    ),
-]
-
-backend_param_values = ["cugraph", "cugraph-preconverted", None]
-
-
-def setup_module(module):
-    """
-    Trivial conversion call to force various one-time CUDA initialization
-    operations to happen outside of benchmarks.
-    """
-    G = nx.karate_club_graph()
-    nxcg.from_networkx(G)
-
-
-# Test IDs are generated using the lambda assigned to the ids arg to provide an
-# easier-to-read name. This is especially helpful for Dataset objs (see
-# https://docs.pytest.org/en/stable/reference/reference.html#pytest-fixture)
-@pytest.fixture(
-    scope="module", params=dataset_param_values, ids=lambda ds: f"ds={str(ds)}"
-)
-def graph_obj(request):
-    """
-    Returns a NX Graph or DiGraph obj from the dataset instance parameter.
-    """
-    dataset = request.param
-    return nx_graph_from_dataset(dataset)
-
-
-@pytest.fixture(
-    scope="module",
-    params=backend_param_values,
-    ids=lambda backend: f"backend={backend}",
-)
-def backend(request):
-    """
-    Returns the backend name to use. This is done as a fixture for consistency
-    and simplicity when creating benchmarks (no need to mark the benchmark as
-    parametrized).
-    """
-    return request.param
-
-
-################################################################################
-# Helpers
-def nx_graph_from_dataset(dataset_obj):
-    """
-    Read the dataset specified by the dataset_obj and create and return a
-    nx.Graph or nx.DiGraph instance based on the dataset is_directed metadata.
-    """
-    create_using = nx.DiGraph if dataset_obj.metadata["is_directed"] else nx.Graph
-    names = dataset_obj.metadata["col_names"]
-    pandas_edgelist = dataset_obj.get_edgelist(download=True, reader="pandas")
-    G = nx.from_pandas_edgelist(
-        pandas_edgelist, source=names[0], target=names[1], create_using=create_using
-    )
-    return G
-
-
-def get_legacy_backend_wrapper(backend_name):
-    """
-    Returns a callable that wraps an algo function with either the default
-    dispatcher (which dispatches based on input graph type), or the "testing"
-    dispatcher (which autoconverts and unconditionally dispatches).
-    This is only supported for NetworkX <3.2
-    """
-    backends.plugin_name = "cugraph"
-    orig_dispatch = backends._dispatch
-    testing_dispatch = backends.test_override_dispatch
-
-    if backend_name == "cugraph":
-        dispatch = testing_dispatch
-    else:
-        dispatch = orig_dispatch
-
-    def wrap_callable_for_dispatch(func, exhaust_returned_iterator=False):
-        # Networkx <3.2 registers functions when the dispatch decorator is
-        # applied (called) and errors if re-registered, so clear bookkeeping to
-        # allow it to be called repeatedly.
-        backends._registered_algorithms = {}
-        actual_func = dispatch(func)  # returns the func the dispatcher picks
-
-        def wrapper(*args, **kwargs):
-            retval = actual_func(*args, **kwargs)
-            if exhaust_returned_iterator:
-                retval = list(retval)
-            return retval
-
-        return wrapper
-
-    return wrap_callable_for_dispatch
-
-
-def get_backend_wrapper(backend_name):
-    """
-    Returns a callable that wraps an algo function in order to set the
-    "backend" kwarg on it.
-    This is only supported for NetworkX >= 3.2
-    """
-
-    def wrap_callable_for_dispatch(func, exhaust_returned_iterator=False):
-        def wrapper(*args, **kwargs):
-            kwargs["backend"] = backend_name
-            retval = func(*args, **kwargs)
-            if exhaust_returned_iterator:
-                retval = list(retval)
-            return retval
-
-        return wrapper
-
-    return wrap_callable_for_dispatch
-
-
-@pytest.fixture(
-    scope="module",
-    params=backend_param_values,
-    ids=lambda backend: f"backend={backend}",
-)
-def backend_wrapper(request):
-    """
-    Returns a callable that takes a function algo and wraps it in another
-    function that calls the algo using the appropriate backend.
-
-    For example: if the backend to test is "cugraph", this will return a
-    function that calls nx.pagerank(..., backend='cugraph')
-    """
-    backend_name = request.param
-    actual_backend_name = backend_name
-
-    # Special case: cugraph-preconverted may be specified as a backend but this
-    # name is reserved to indicate a cugraph backend is to be used with a
-    # preconverted graph obj (rather than having the backend do the
-    # conversion).
-    if backend_name == "cugraph-preconverted":
-        actual_backend_name = "cugraph"
-
-    # NX <3.2 does not support the backends= kwarg, so the backend must be
-    # enabled differently
-    if backends is not None:
-        wrapper = get_legacy_backend_wrapper(actual_backend_name)
-    else:
-        wrapper = get_backend_wrapper(actual_backend_name)
-
-    wrapper.backend_name = backend_name
-    return wrapper
-
-
-def get_graph_obj_for_benchmark(graph_obj, backend_wrapper):
-    """
-    Given a Graph object and a backend name, return a converted Graph or the
-    original Graph object based on the backend to use.
-
-    This is needed because some backend names are actually used as descriptions
-    for combinations of backends and converted/non-converted graphs.  For
-    example, a benchmark may specify the "cugraph-preconverted" backend, which
-    is not an installed backend but instead refers to the "cugraph" backend
-    passed a NX Graph that has been converted to a nx-cugraph Graph object.
-    """
-    G = graph_obj
-    if backend_wrapper.backend_name == "cugraph-preconverted":
-        G = nxcg.from_networkx(G, preserve_all_attrs=True)
-    return G
-
-
-def get_highest_degree_node(graph_obj):
-    degrees = graph_obj.degree()  # list of tuples of (node, degree)
-    return max(degrees, key=lambda t: t[1])[0]
-
-
-def build_personalization_dict(pagerank_dict):
-    """
-    Returns a dictionary that can be used as the personalization value for a
-    call to nx.pagerank(). The pagerank_dict passed in is used as the initial
-    source of values for each node, and this function simply treats the list of
-    dict values as two halves (halves A and B) and swaps them so (most if not
-    all) nodes/keys are assigned a different value from the dictionary.
-    """
-    num_half = len(pagerank_dict) // 2
-    A_half_items = list(pagerank_dict.items())[:num_half]
-    B_half_items = list(pagerank_dict.items())[num_half:]
-
-    # Support an odd number of items by initializing with B_half_items, which
-    # will always be one bigger if the number of items is odd. This will leave
-    # the one remainder (in the case of an odd number) unchanged.
-    pers_dict = dict(B_half_items)
-    pers_dict.update({A_half_items[i][0]: B_half_items[i][1] for i in range(num_half)})
-    pers_dict.update({B_half_items[i][0]: A_half_items[i][1] for i in range(num_half)})
-
-    return pers_dict
-
-
-################################################################################
-# Benchmarks
-def bench_from_networkx(benchmark, graph_obj):
-    benchmark(nxcg.from_networkx, graph_obj)
-
-
-# normalized_param_values = [True, False]
-normalized_param_values = [True]
-k_param_values = [10, 20, 50, 100, 500, 1000]
-
-
-@pytest.mark.parametrize(
-    "normalized", normalized_param_values, ids=lambda norm: f"{norm=}"
-)
-@pytest.mark.parametrize("k", k_param_values, ids=lambda k: f"{k=}")
-def bench_betweenness_centrality(benchmark, graph_obj, backend_wrapper, normalized, k):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    if k > G.number_of_nodes():
-        pytest.skip(reason=f"{k=} > {G.number_of_nodes()=}")
-
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.betweenness_centrality),
-        args=(G,),
-        kwargs=dict(
-            weight=None,
-            normalized=normalized,
-            k=k,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-@pytest.mark.parametrize(
-    "normalized", normalized_param_values, ids=lambda norm: f"{norm=}"
-)
-@pytest.mark.parametrize("k", k_param_values, ids=lambda k: f"{k=}")
-def bench_edge_betweenness_centrality(
-    benchmark, graph_obj, backend_wrapper, normalized, k
-):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-
-    if k > G.number_of_nodes():
-        pytest.skip(reason=f"{k=} > {G.number_of_nodes()=}")
-
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.edge_betweenness_centrality),
-        args=(G,),
-        kwargs=dict(
-            weight=None,
-            normalized=normalized,
-            k=k,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-def bench_louvain_communities(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    # DiGraphs are not supported
-    if G.is_directed():
-        G = G.to_undirected()
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.community.louvain_communities),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is list
-
-
-def bench_degree_centrality(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.degree_centrality),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-def bench_eigenvector_centrality(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.eigenvector_centrality),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-@pytest.mark.parametrize(
-    "normalized", normalized_param_values, ids=lambda norm: f"{norm=}"
-)
-def bench_hits(benchmark, graph_obj, backend_wrapper, normalized):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.hits),
-        args=(G,),
-        kwargs=dict(
-            normalized=normalized,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is tuple
-    assert len(result) == 2
-    assert type(result[0]) is dict
-    assert type(result[1]) is dict
-
-
-def bench_in_degree_centrality(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.in_degree_centrality),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-@pytest.mark.parametrize(
-    "normalized", normalized_param_values, ids=lambda norm: f"{norm=}"
-)
-def bench_katz_centrality(benchmark, graph_obj, backend_wrapper, normalized):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.katz_centrality),
-        args=(G,),
-        kwargs=dict(
-            normalized=normalized,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-def bench_k_truss(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    # DiGraphs are not supported
-    if G.is_directed():
-        G = G.to_undirected()
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.k_truss),
-        args=(G,),
-        kwargs=dict(
-            k=2,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    # Check that this at least appears to be some kind of NX-like Graph
-    assert hasattr(result, "has_node")
-
-
-def bench_out_degree_centrality(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.out_degree_centrality),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-def bench_pagerank(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.pagerank),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-def bench_pagerank_personalized(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-
-    # FIXME: This will run for every combination of inputs, even if the
-    # graph/dataset does not change. Ideally this is run once per
-    # graph/dataset.
-    pagerank_dict = nx.pagerank(G)
-    personalization_dict = build_personalization_dict(pagerank_dict)
-
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.pagerank),
-        args=(G,),
-        kwargs={"personalization": personalization_dict},
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-def bench_shortest_path(benchmark, graph_obj, backend_wrapper):
-    """
-    This passes in the source node with the highest degree, but no target.
-    """
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.shortest_path),
-        args=(G,),
-        kwargs=dict(
-            source=node,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-def bench_single_source_shortest_path_length(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.single_source_shortest_path_length),
-        args=(G,),
-        kwargs=dict(
-            source=node,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-def bench_single_target_shortest_path_length(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-    result = benchmark.pedantic(
-        target=backend_wrapper(
-            nx.single_target_shortest_path_length, exhaust_returned_iterator=True
-        ),
-        args=(G,),
-        kwargs=dict(
-            target=node,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    # exhaust_returned_iterator=True forces the result to a list, but is not
-    # needed for this algo in NX 3.3+ which returns a dict instead of an
-    # iterator. Forcing to a list does not change the benchmark timing.
-    assert type(result) is list
-
-
-def bench_ancestors(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.ancestors),
-        args=(G,),
-        kwargs=dict(
-            source=node,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is set
-
-
-def bench_average_clustering(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    # DiGraphs are not supported by nx-cugraph
-    if G.is_directed():
-        G = G.to_undirected()
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.average_clustering),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is float
-
-
-def bench_generic_bfs_edges(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.generic_bfs_edges, exhaust_returned_iterator=True),
-        args=(G,),
-        kwargs=dict(
-            source=node,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is list
-
-
-def bench_bfs_edges(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.bfs_edges, exhaust_returned_iterator=True),
-        args=(G,),
-        kwargs=dict(
-            source=node,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is list
-
-
-def bench_bfs_layers(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.bfs_layers, exhaust_returned_iterator=True),
-        args=(G,),
-        kwargs=dict(
-            sources=node,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is list
-
-
-def bench_bfs_predecessors(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.bfs_predecessors, exhaust_returned_iterator=True),
-        args=(G,),
-        kwargs=dict(
-            source=node,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is list
-
-
-def bench_bfs_successors(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.bfs_successors, exhaust_returned_iterator=True),
-        args=(G,),
-        kwargs=dict(
-            source=node,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is list
-
-
-def bench_bfs_tree(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.bfs_tree),
-        args=(G,),
-        kwargs=dict(
-            source=node,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    # Check that this at least appears to be some kind of NX-like Graph
-    assert hasattr(result, "has_node")
-
-
-def bench_clustering(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    # DiGraphs are not supported by nx-cugraph
-    if G.is_directed():
-        G = G.to_undirected()
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.clustering),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-def bench_core_number(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    # DiGraphs are not supported by nx-cugraph
-    if G.is_directed():
-        G = G.to_undirected()
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.core_number),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-def bench_descendants(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.descendants),
-        args=(G,),
-        kwargs=dict(
-            source=node,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is set
-
-
-def bench_descendants_at_distance(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.descendants_at_distance),
-        args=(G,),
-        kwargs=dict(
-            source=node,
-            distance=1,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is set
-
-
-def bench_is_bipartite(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.is_bipartite),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is bool
-
-
-def bench_is_strongly_connected(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.is_strongly_connected),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is bool
-
-
-def bench_is_weakly_connected(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.is_weakly_connected),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is bool
-
-
-def bench_number_strongly_connected_components(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.number_strongly_connected_components),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is int
-
-
-def bench_number_weakly_connected_components(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.number_weakly_connected_components),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is int
-
-
-def bench_overall_reciprocity(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.overall_reciprocity),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is float
-
-
-def bench_reciprocity(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.reciprocity),
-        args=(G,),
-        kwargs=dict(
-            nodes=node,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is float
-
-
-def bench_strongly_connected_components(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(
-            nx.strongly_connected_components, exhaust_returned_iterator=True
-        ),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is list
-
-
-def bench_transitivity(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    # DiGraphs are not supported by nx-cugraph
-    if G.is_directed():
-        G = G.to_undirected()
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.transitivity),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is float
-
-
-def bench_triangles(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    # DiGraphs are not supported
-    if G.is_directed():
-        G = G.to_undirected()
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.triangles),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is dict
-
-
-def bench_weakly_connected_components(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    result = benchmark.pedantic(
-        target=backend_wrapper(
-            nx.weakly_connected_components, exhaust_returned_iterator=True
-        ),
-        args=(G,),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert type(result) is list
-
-
-def bench_ego_graph(benchmark, graph_obj, backend_wrapper):
-    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-    node = get_highest_degree_node(graph_obj)
-    result = benchmark.pedantic(
-        target=backend_wrapper(nx.ego_graph),
-        args=(G,),
-        kwargs=dict(
-            n=node,
-            radius=100,
-        ),
-        rounds=rounds,
-        iterations=iterations,
-        warmup_rounds=warmup_rounds,
-    )
-    assert isinstance(result, (nx.Graph, nxcg.Graph))
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_complete_bipartite_graph(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_connected_components(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_is_connected(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_node_connected_component(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_number_connected_components(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_is_isolate(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_isolates(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_number_of_isolates(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_complement(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_reverse(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_is_arborescence(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_is_branching(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_is_forest(benchmark, graph_obj, backend_wrapper):
-    pass
-
-
-@pytest.mark.skip(reason="benchmark not implemented")
-def bench_is_tree(benchmark, graph_obj, backend_wrapper):
-    pass
diff --git a/benchmarks/nx-cugraph/pytest-based/create_results_summary_page.py b/benchmarks/nx-cugraph/pytest-based/create_results_summary_page.py
deleted file mode 100644
index e4aff10f0a5..00000000000
--- a/benchmarks/nx-cugraph/pytest-based/create_results_summary_page.py
+++ /dev/null
@@ -1,293 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import re
-import pathlib
-import json
-import platform
-import psutil
-import socket
-import subprocess
-
-
-def get_formatted_time_value(time):
-    res = ""
-    if time < 1:
-        if time < 0.001:
-            units = "us"
-            time *= 1e6
-        else:
-            units = "ms"
-            time *= 1e3
-    else:
-        units = "s"
-    return f"{time:.3f}{units}"
-
-
-def get_all_benchmark_info():
-    benchmarks = {}
-    # Populate benchmarks dir from .json files
-    for json_file in logs_dir.glob("*.json"):
-        try:
-            data = json.loads(open(json_file).read())
-        except json.decoder.JSONDecodeError:
-            continue
-
-        for benchmark_run in data["benchmarks"]:
-            # example name: "bench_triangles[ds=netscience-backend=cugraph-preconverted]"
-            name = benchmark_run["name"]
-
-            algo_name = name.split("[")[0]
-            if algo_name.startswith("bench_"):
-                algo_name = algo_name[6:]
-            # special case for betweenness_centrality
-            match = k_patt.match(name)
-            if match is not None:
-                algo_name += f", k={match.group(1)}"
-
-            match = dataset_patt.match(name)
-            if match is None:
-                raise RuntimeError(
-                    f"benchmark name {name} in file {json_file} has an unexpected format"
-                )
-            dataset = match.group(1)
-            if dataset.endswith("-backend"):
-                dataset = dataset[:-8]
-
-            match = backend_patt.match(name)
-            if match is None:
-                raise RuntimeError(
-                    f"benchmark name {name} in file {json_file} has an unexpected format"
-                )
-            backend = match.group(1)
-            if backend == "None":
-                backend = "networkx"
-
-            runtime = benchmark_run["stats"]["mean"]
-            benchmarks.setdefault(algo_name, {}).setdefault(backend, {})[
-                dataset
-            ] = runtime
-    return benchmarks
-
-
-def compute_perf_vals(cugraph_runtime, networkx_runtime):
-    speedup_string = f"{networkx_runtime / cugraph_runtime:.3f}X"
-    delta = networkx_runtime - cugraph_runtime
-    if abs(delta) < 1:
-        if abs(delta) < 0.001:
-            units = "us"
-            delta *= 1e6
-        else:
-            units = "ms"
-            delta *= 1e3
-    else:
-        units = "s"
-    delta_string = f"{delta:.3f}{units}"
-
-    return (speedup_string, delta_string)
-
-
-def get_mem_info():
-    return round(psutil.virtual_memory().total / (1024**3), 2)
-
-
-def get_cuda_version():
-    output = subprocess.check_output("nvidia-smi", shell=True).decode()
-    try:
-        return next(
-            line.split("CUDA Version: ")[1].split()[0]
-            for line in output.splitlines()
-            if "CUDA Version" in line
-        )
-    except subprocess.CalledProcessError:
-        return "Failed to get CUDA version."
-
-
-def get_first_gpu_info():
-    try:
-        gpu_info = (
-            subprocess.check_output(
-                "nvidia-smi --query-gpu=name,memory.total,memory.free,memory.used --format=csv,noheader",
-                shell=True,
-            )
-            .decode()
-            .strip()
-        )
-        if gpu_info:
-            gpus = gpu_info.split("\n")
-            num_gpus = len(gpus)
-            first_gpu = gpus[0]  # Get the information for the first GPU
-            gpu_name, mem_total, _, _ = first_gpu.split(",")
-            return f"{num_gpus} x {gpu_name.strip()} ({round(int(mem_total.strip().split()[0]) / (1024), 2)} GB)"
-        else:
-            print("No GPU found or unable to query GPU details.")
-    except subprocess.CalledProcessError:
-        print("Failed to execute nvidia-smi. No GPU information available.")
-
-
-def get_system_info():
-    print('<div class="box2">')
-    print(f"<p>Hostname: {socket.gethostname()}</p>")
-    print(
-        f'<p class="indent"">Operating System: {platform.system()} {platform.release()}</p>'
-    )
-    print(f'<p class="indent">Kernel Version  : {platform.version()}</p>')
-    with open("/proc/cpuinfo") as f:
-        print(
-            f'<p>CPU: {next(line.strip().split(": ")[1] for line in f if "model name" in line)} ({psutil.cpu_count(logical=False)} cores)</p>'
-        )
-    print(f'<p class="indent">Memory: {get_mem_info()} GB</p>')
-    print(f"<p>GPU: {get_first_gpu_info()}</p>")
-    print(f"<p>CUDA Version: {get_cuda_version()}</p>")
-
-
-if __name__ == "__main__":
-    logs_dir = pathlib.Path("logs")
-
-    dataset_patt = re.compile(".*ds=([\w-]+).*")
-    backend_patt = re.compile(".*backend=(\w+).*")
-    k_patt = re.compile(".*k=(\d+).*")
-
-    # Organize all benchmark runs by the following hierarchy: algo -> backend -> dataset
-    benchmarks = get_all_benchmark_info()
-
-    # dump HTML table
-    ordered_datasets = [
-        "netscience",
-        "email_Eu_core",
-        "amazon0302",
-        "cit-patents",
-        "hollywood",
-        "soc-livejournal1",
-    ]
-    # dataset, # Node, # Edge, Directed info
-    dataset_meta = {
-        "netscience": ["1,461", "5,484", "Yes"],
-        "email_Eu_core": ["1,005", "25,571", "Yes"],
-        "amazon0302": ["262,111", "1,234,877", "Yes"],
-        "cit-patents": ["3,774,768", "16,518,948", "Yes"],
-        "hollywood": ["1,139,905", "57,515,616", "No"],
-        "soc-livejournal1": ["4,847,571", "68,993,773", "Yes"],
-    }
-
-    print(
-        """
-    <html>
-    <head>
-    <style>
-        table {
-            table-layout: fixed;
-            width: 100%;
-            border-collapse: collapse;
-        }
-        tbody tr:nth-child(odd) {
-            background-color: #ffffff;
-        }
-        tbody tr:nth-child(even) {
-            background-color: #d3d3d3;
-        }
-        tbody td {
-            text-align: center;
-            color: black;
-        }
-        th,
-        td {
-            padding: 12px;
-        }
-        .footer-main {
-            background-color: #d1d1d1;
-            padding: 20px;
-            padding-top: 0px;
-            font-size: 12px;
-            color: black;
-            width: 100%;
-            display: flex;
-        }
-        .box1{
-            flex: 1;
-            padding-right: 30px;
-        }
-        .box2{
-            flex: 4;
-        }
-        .indent {
-            text-indent: 20px;
-        }
-    </style>
-    </head>
-    <table>
-    <thead>
-    <tr>
-        <th>Dataset<br>Nodes<br>Edges<Br>Directed</th>"""
-    )
-    for ds in ordered_datasets:
-        print(
-            f"      <th>{ds}<br>{dataset_meta[ds][0]}<br>{dataset_meta[ds][1]}<br>{dataset_meta[ds][2]}<br></th>"
-        )
-    print(
-        """   </tr>
-    </thead>
-    <tbody>
-    """
-    )
-    for algo_name in sorted(benchmarks):
-        algo_runs = benchmarks[algo_name]
-        print("   <tr>")
-        print(f"      <td>{algo_name}</td>")
-        # Proceed only if any results are present for both cugraph and NX
-        if "cugraph" in algo_runs and "networkx" in algo_runs:
-            cugraph_algo_runs = algo_runs["cugraph"]
-            networkx_algo_runs = algo_runs["networkx"]
-            datasets_in_both = set(cugraph_algo_runs).intersection(networkx_algo_runs)
-
-            # populate the table with speedup results for each dataset in the order
-            # specified in ordered_datasets. If results for a run using a dataset
-            # are not present for both cugraph and NX, output an empty cell.
-            for dataset in ordered_datasets:
-                if dataset in datasets_in_both:
-                    cugraph_runtime = cugraph_algo_runs[dataset]
-                    networkx_runtime = networkx_algo_runs[dataset]
-                    (speedup, runtime_delta) = compute_perf_vals(
-                        cugraph_runtime=cugraph_runtime,
-                        networkx_runtime=networkx_runtime,
-                    )
-                    nx_formatted = get_formatted_time_value(networkx_runtime)
-                    cg_formatted = get_formatted_time_value(cugraph_runtime)
-                    print(
-                        f"      <td>{nx_formatted} / {cg_formatted}<br>{speedup}<br>{runtime_delta}</td>"
-                    )
-                else:
-                    print(f"      <td></td>")
-
-        # If a comparison between cugraph and NX cannot be made, output empty cells
-        # for each dataset
-        else:
-            for _ in range(len(ordered_datasets)):
-                print("      <td></td>")
-        print("   </tr>")
-    print(
-        """
-    </tbody>\n</table>
-    <div class="footer-main">
-        <div class="box1">
-            <h4>Table Format:</h4>
-            <ul>
-                <li><strong>NetworkX time / nx-cugraph time</strong></li>
-                <li><strong>Speed-up of using nx-cugraph</strong></li>
-                <li><strong>Time-delta</strong></li>
-            </ul>
-        </div>"""
-    )
-    get_system_info()
-    print("""</div>\n</div>\n</html>""")
diff --git a/benchmarks/nx-cugraph/pytest-based/run-2402.sh b/benchmarks/nx-cugraph/pytest-based/run-2402.sh
deleted file mode 100755
index 44ed0bda43a..00000000000
--- a/benchmarks/nx-cugraph/pytest-based/run-2402.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-#
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Runs benchmarks for the 24.02 algos.
-# Pass either a or b or both. This is useful for separating batches of runs on different GPUs:
-# CUDA_VISIBLE_DEVICES=1 run-2402.sh b
-
-mkdir -p logs
-
-# benches="$benches ..." pattern is easy to comment out individual runs
-benches=
-
-while [[ $1 != "" ]]; do
-    if [[ $1 == "a" ]]; then
-        benches="$benches bench_ancestors"
-        benches="$benches bench_average_clustering"
-        benches="$benches bench_generic_bfs_edges"
-        benches="$benches bench_bfs_edges"
-        benches="$benches bench_bfs_layers"
-        benches="$benches bench_bfs_predecessors"
-        benches="$benches bench_bfs_successors"
-        benches="$benches bench_bfs_tree"
-        benches="$benches bench_clustering"
-        benches="$benches bench_core_number"
-        benches="$benches bench_descendants"
-    elif [[ $1 == "b" ]]; then
-        benches="$benches bench_descendants_at_distance"
-        benches="$benches bench_is_bipartite"
-        benches="$benches bench_is_strongly_connected"
-        benches="$benches bench_is_weakly_connected"
-        benches="$benches bench_number_strongly_connected_components"
-        benches="$benches bench_number_weakly_connected_components"
-        benches="$benches bench_overall_reciprocity"
-        benches="$benches bench_reciprocity"
-        benches="$benches bench_strongly_connected_components"
-        benches="$benches bench_transitivity"
-        benches="$benches bench_triangles"
-        benches="$benches bench_weakly_connected_components"
-    fi
-    shift
-done
-
-for bench in $benches; do
-    pytest -sv -k "soc-livejournal1" "bench_algos.py::$bench" 2>&1 | tee "logs/${bench}.log"
-done
diff --git a/benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh b/benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh
deleted file mode 100755
index 73c85000b0f..00000000000
--- a/benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# location to store datasets used for benchmarking
-export RAPIDS_DATASET_ROOT_DIR=${RAPIDS_DATASET_ROOT_DIR:-/datasets/cugraph}
-mkdir -p logs
-
-# list of algos, datasets, and back-ends to use in combinations
-algos="
-    pagerank
-    betweenness_centrality
-    louvain
-    shortest_path
-    weakly_connected_components
-    triangles
-    bfs_predecessors
-"
-datasets="
-   netscience
-   email_Eu_core
-   amazon0302
-   cit-patents
-   hollywood
-   soc-livejournal
-"
-# None backend is default networkx
-# cugraph-preconvert backend is nx-cugraph
-backends="
-    None
-    cugraph-preconverted
-"
-
-# edit this directly to for pytest
-# e.g. -k "and not 100 and not 1000"
-bc_k_values=""
-
-# check for --cpu-only or --gpu-only args
-if [[ "$#" -eq 1 ]]; then
-    case $1 in
-        --cpu-only)
-            backends="None"
-            ;;
-        --gpu-only)
-            backends="cugraph-preconverted"
-            ;;
-        *)
-            echo "Unknown option: $1"
-            exit 1
-            ;;
-    esac
-fi
-
-for algo in $algos; do
-    for dataset in $datasets; do
-        for backend in $backends; do
-            name="${backend}__${algo}__${dataset}"
-            echo "Running: $backend, $dataset, bench_$algo"
-
-            # uncomment to get command for reproducing test
-            # echo "RUNNING: \"pytest -sv -k \"$backend and $dataset and bench_$algo $bc_k_values\" --benchmark-json=\"logs/${name}.json\" bench_algos.py"
-
-            pytest -sv \
-                -k "$backend and $dataset and bench_$algo $bc_k_values" \
-                --benchmark-json="logs/${name}.json" \
-                bench_algos.py 2>&1 | tee "logs/${name}.out"
-        done
-    done
-done
diff --git a/build.sh b/build.sh
index 29abd48166a..756045461dd 100755
--- a/build.sh
+++ b/build.sh
@@ -29,10 +29,6 @@ VALIDARGS="
    pylibcugraph
    cugraph
    cugraph-service
-   cugraph-pyg
-   cugraph-dgl
-   cugraph-equivariant
-   nx-cugraph
    cpp-mgtests
    cpp-mtmgtests
    docs
@@ -58,10 +54,6 @@ HELP="$0 [<target> ...] [<flag> ...]
    pylibcugraph               - build the pylibcugraph Python package
    cugraph                    - build the cugraph Python package
    cugraph-service            - build the cugraph-service_client and cugraph-service_server Python package
-   cugraph-pyg                - build the cugraph-pyg Python package
-   cugraph-dgl                - build the cugraph-dgl extensions for DGL
-   cugraph-equivariant        - build the cugraph-equivariant Python package
-   nx-cugraph                 - build the nx-cugraph Python package
    cpp-mgtests                - build libcugraph and libcugraph_etl MG tests. Builds MPI communicator, adding MPI as a dependency.
    cpp-mtmgtests              - build libcugraph MTMG tests. Adds UCX as a dependency (temporary).
    docs                       - build the docs
@@ -88,12 +80,10 @@ LIBCUGRAPH_ETL_BUILD_DIR=${LIBCUGRAPH_ETL_BUILD_DIR:=${REPODIR}/cpp/libcugraph_e
 CUGRAPH_SERVICE_BUILD_DIRS="${REPODIR}/python/cugraph-service/server/build
                             ${REPODIR}/python/cugraph-service/client/build
 "
-CUGRAPH_DGL_BUILD_DIR=${REPODIR}/python/cugraph-dgl/build
 
 BUILD_DIRS="${LIBCUGRAPH_BUILD_DIR}
             ${LIBCUGRAPH_ETL_BUILD_DIR}
             ${CUGRAPH_SERVICE_BUILD_DIRS}
-            ${CUGRAPH_DGL_BUILD_DIR}
 "
 
 # Set defaults for vars modified by flags to this script
@@ -211,8 +201,7 @@ if hasArg uninstall; then
     # FIXME: if multiple versions of these packages are installed, this only
     # removes the latest one and leaves the others installed. build.sh uninstall
     # can be run multiple times to remove all of them, but that is not obvious.
-    pip uninstall -y pylibcugraph cugraph cugraph-service-client cugraph-service-server \
-        cugraph-dgl cugraph-pyg cugraph-equivariant nx-cugraph
+    pip uninstall -y pylibcugraph cugraph cugraph-service-client cugraph-service-server
 fi
 
 if hasArg clean; then
@@ -330,42 +319,6 @@ if hasArg cugraph-service || hasArg all; then
     fi
 fi
 
-# Build and install the cugraph-pyg Python package
-if hasArg cugraph-pyg || hasArg all; then
-    if hasArg --clean; then
-        cleanPythonDir ${REPODIR}/python/cugraph-pyg
-    else
-        python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/cugraph-pyg
-    fi
-fi
-
-# Install the cugraph-dgl extensions for DGL
-if hasArg cugraph-dgl || hasArg all; then
-    if hasArg --clean; then
-        cleanPythonDir ${REPODIR}/python/cugraph-dgl
-    else
-        python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/cugraph-dgl
-    fi
-fi
-
-# Build and install the cugraph-equivariant Python package
-if hasArg cugraph-equivariant || hasArg all; then
-    if hasArg --clean; then
-        cleanPythonDir ${REPODIR}/python/cugraph-equivariant
-    else
-        python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/cugraph-equivariant
-    fi
-fi
-
-# Build and install the nx-cugraph Python package
-if hasArg nx-cugraph || hasArg all; then
-    if hasArg --clean; then
-        cleanPythonDir ${REPODIR}/python/nx-cugraph
-    else
-        python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/nx-cugraph
-    fi
-fi
-
 # Build the docs
 if hasArg docs || hasArg all; then
     if [ ! -d ${LIBCUGRAPH_BUILD_DIR} ]; then
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 01c573c96ca..2d7e90da8d0 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -48,7 +48,7 @@ rapids-mamba-retry install \
   "libcugraph_etl=${RAPIDS_VERSION_MAJOR_MINOR}.*" \
   "pylibcugraphops=${RAPIDS_VERSION_MAJOR_MINOR}.*" \
   "pylibwholegraph=${RAPIDS_VERSION_MAJOR_MINOR}.*" \
-  "pytorch>=2.3,<2.4" \
+  'pytorch>=2.3' \
   "cuda-version=${CONDA_CUDA_VERSION}"
 
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 9a8f1227488..eab41f63da0 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -50,9 +50,4 @@ rapids-conda-retry mambabuild \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cugraph-service
 
-rapids-conda-retry mambabuild \
-  --no-test \
-  --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
-  conda/recipes/cugraph-equivariant
-
 rapids-upload-conda-to-s3 python
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 3c89d63538c..9a77e6b3021 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -29,27 +29,22 @@ python -m pip wheel \
 
 sccache --show-adv-stats
 
-# pure-python packages should be marked as pure, and not have auditwheel run on them.
-if [[ ${package_name} == "cugraph-equivariant" ]]; then
-    RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python dist
-else
-    case "${RAPIDS_CUDA_VERSION}" in
-        12.*)
-            EXCLUDE_ARGS=(
-                --exclude "libcublas.so.12"
-                --exclude "libcublasLt.so.12"
-                --exclude "libcurand.so.10"
-                --exclude "libcusolver.so.11"
-                --exclude "libcusparse.so.12"
-                --exclude "libnvJitLink.so.12"
-            )
-        ;;
-        11.*)
-            EXCLUDE_ARGS=()
-        ;;
-    esac
-
-    mkdir -p final_dist
-    python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/*
-    RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist
-fi
+case "${RAPIDS_CUDA_VERSION}" in
+    12.*)
+        EXCLUDE_ARGS=(
+            --exclude "libcublas.so.12"
+            --exclude "libcublasLt.so.12"
+            --exclude "libcurand.so.10"
+            --exclude "libcusolver.so.11"
+            --exclude "libcusparse.so.12"
+            --exclude "libnvJitLink.so.12"
+        )
+    ;;
+    11.*)
+        EXCLUDE_ARGS=()
+    ;;
+esac
+
+mkdir -p final_dist
+python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/*
+RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist
diff --git a/ci/build_wheel_cugraph-equivariant.sh b/ci/build_wheel_cugraph-equivariant.sh
deleted file mode 100755
index 2f270422f84..00000000000
--- a/ci/build_wheel_cugraph-equivariant.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-set -euo pipefail
-
-package_dir="python/cugraph-equivariant"
-
-./ci/build_wheel.sh cugraph-equivariant ${package_dir}
-./ci/validate_wheel.sh ${package_dir} dist
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index be1988e31dd..961f7816caa 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -51,8 +51,6 @@ NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; p
 DEPENDENCIES=(
   cudf
   cugraph
-  cugraph-dgl
-  cugraph-pyg
   cugraph-service-server
   cugraph-service-client
   cuxfilter
@@ -75,7 +73,7 @@ DEPENDENCIES=(
 UCXX_DEPENDENCIES=(
   ucx-py
 )
-for FILE in dependencies.yaml conda/environments/*.yaml python/cugraph-{pyg,dgl}/conda/*.yaml; do
+for FILE in dependencies.yaml conda/environments/*.yaml; do
   for DEP in "${DEPENDENCIES[@]}"; do
     sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
   done
diff --git a/ci/run_cugraph_dgl_pytests.sh b/ci/run_cugraph_dgl_pytests.sh
deleted file mode 100755
index 83c26a57dc0..00000000000
--- a/ci/run_cugraph_dgl_pytests.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-set -euo pipefail
-
-# Support invoking run_cugraph_dgl_pytests.sh outside the script directory
-cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-dgl/tests
-
-pytest --cache-clear --ignore=mg "$@" .
diff --git a/ci/run_cugraph_equivariant_pytests.sh b/ci/run_cugraph_equivariant_pytests.sh
deleted file mode 100755
index 5d5a5fb05c2..00000000000
--- a/ci/run_cugraph_equivariant_pytests.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-set -euo pipefail
-
-# Support invoking run_cugraph_equivariant_pytests.sh outside the script directory
-cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-equivariant/cugraph_equivariant
-
-pytest --cache-clear "$@" .
diff --git a/ci/run_cugraph_pyg_pytests.sh b/ci/run_cugraph_pyg_pytests.sh
deleted file mode 100755
index fb27f16d79e..00000000000
--- a/ci/run_cugraph_pyg_pytests.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-set -euo pipefail
-
-# Support invoking run_cugraph_pyg_pytests.sh outside the script directory
-cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-pyg/cugraph_pyg
-
-pytest --cache-clear --benchmark-disable "$@" .
-
-# Used to skip certain examples in CI due to memory limitations
-export CI_RUN=1
-
-# Test examples
-for e in "$(pwd)"/examples/*.py; do
-  rapids-logger "running example $e"
-  (yes || true) | python $e
-done
diff --git a/ci/test.sh b/ci/test.sh
index 884ed7ac881..8e19b6c8c18 100755
--- a/ci/test.sh
+++ b/ci/test.sh
@@ -99,13 +99,6 @@ if hasArg "--run-python-tests"; then
     pytest -sv -m sg -m "managedmem_on and poolallocator_on and tiny" --benchmark-disable
     echo "Ran Python benchmarks for cuGraph (running as tests) : return code was: $?, test script exit code is now: $EXITCODE"
 
-    echo "Python pytest for cugraph_pyg (single-GPU only)..."
-    conda list
-    cd ${CUGRAPH_ROOT}/python/cugraph-pyg/cugraph_pyg
-    # rmat is not tested because of MG testing
-    pytest -sv -m sg --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph-pytests.xml -v --cov-config=.coveragerc --cov=cugraph_pyg --cov-report=xml:${WORKSPACE}/python/cugraph_pyg/cugraph-coverage.xml --cov-report term --ignore=raft --benchmark-disable
-    echo "Ran Python pytest for cugraph_pyg : return code was: $?, test script exit code is now: $EXITCODE"
-
     echo "Python pytest for cugraph-service (single-GPU only)..."
     cd ${CUGRAPH_ROOT}/python/cugraph-service
     pytest -sv --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph-service-pytests.xml --benchmark-disable -k "not mg" ./tests
diff --git a/ci/test_python.sh b/ci/test_python.sh
index a3a177dcfc6..646b61805cc 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -100,42 +100,5 @@ rapids-logger "pytest cugraph-service (single GPU)"
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-service-coverage.xml" \
   --cov-report=term
 
-# test cugraph-equivariant
-if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
-  if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then
-    rapids-mamba-retry env create --yes -f env.yaml -n test_cugraph_equivariant
-    set +u
-    conda activate test_cugraph_equivariant
-    set -u
-    rapids-mamba-retry install \
-      --channel "${CPP_CHANNEL}" \
-      --channel "${PYTHON_CHANNEL}" \
-      --channel conda-forge \
-      --channel nvidia \
-      "cugraph-equivariant=${RAPIDS_VERSION_MAJOR_MINOR}.*"
-    pip install e3nn==0.5.1
-
-    rapids-print-env
-
-    rapids-logger "pytest cugraph-equivariant"
-    ./ci/run_cugraph_equivariant_pytests.sh \
-      --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph-equivariant.xml" \
-      --cov-config=../../.coveragerc \
-      --cov=cugraph_equivariant \
-      --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-equivariant-coverage.xml" \
-      --cov-report=term
-
-    # Reactivate the test environment back
-    set +u
-    conda deactivate
-    conda activate test
-    set -u
-  else
-    rapids-logger "skipping cugraph-equivariant pytest on ARM64"
-  fi
-else
-  rapids-logger "skipping cugraph-equivariant pytest on CUDA!=11.8"
-fi
-
 rapids-logger "Test script exiting with value: $EXITCODE"
 exit ${EXITCODE}
diff --git a/ci/test_wheel_cugraph-equivariant.sh b/ci/test_wheel_cugraph-equivariant.sh
deleted file mode 100755
index 3be1d578964..00000000000
--- a/ci/test_wheel_cugraph-equivariant.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-set -eoxu pipefail
-
-package_name="cugraph-equivariant"
-
-mkdir -p ./dist
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-
-# Download the cugraph-equivariant built in the previous step
-RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
-
-# determine pytorch source
-PKG_CUDA_VER="$(echo ${CUDA_VERSION} | cut -d '.' -f1,2 | tr -d '.')"
-PKG_CUDA_VER_MAJOR=${PKG_CUDA_VER:0:2}
-if [[ "${PKG_CUDA_VER_MAJOR}" == "12" ]]; then
-  PYTORCH_CUDA_VER="121"
-else
-  PYTORCH_CUDA_VER=$PKG_CUDA_VER
-fi
-PYTORCH_URL="https://download.pytorch.org/whl/cu${PYTORCH_CUDA_VER}"
-
-# echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install \
-    -v \
-    --extra-index-url "${PYTORCH_URL}" \
-    "$(echo ./dist/cugraph_equivariant_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
-    'e3nn' \
-    'torch>=2.3.0,<2.4'
-
-python -m pytest python/cugraph-equivariant/cugraph_equivariant/tests
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 62e8aa355c3..5275d608440 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -4,7 +4,6 @@ channels:
 - rapidsai
 - rapidsai-nightly
 - dask/label/dev
-- dglteam/label/th23_cu118
 - conda-forge
 - nvidia
 dependencies:
@@ -34,7 +33,6 @@ dependencies:
 - nbsphinx
 - nccl>=2.19
 - networkx>=2.5.1
-- networkx>=3.0
 - ninja
 - notebook>=0.5.0
 - numba>=0.57
@@ -53,10 +51,9 @@ dependencies:
 - pytest
 - pytest-benchmark
 - pytest-cov
-- pytest-mpl
 - pytest-xdist
 - python-louvain
-- pytorch>=2.3,<2.4.0a0
+- pytorch>=2.3
 - raft-dask==25.2.*,>=0.0.0a0
 - rapids-build-backend>=0.3.1,<0.4.0.dev0
 - rapids-dask-dependency==25.2.*,>=0.0.0a0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index f5345708dc8..33c9be9cdda 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -4,7 +4,6 @@ channels:
 - rapidsai
 - rapidsai-nightly
 - dask/label/dev
-- dglteam/label/th23_cu118
 - conda-forge
 - nvidia
 dependencies:
@@ -40,7 +39,6 @@ dependencies:
 - nbsphinx
 - nccl>=2.19
 - networkx>=2.5.1
-- networkx>=3.0
 - ninja
 - notebook>=0.5.0
 - numba>=0.57
@@ -58,10 +56,9 @@ dependencies:
 - pytest
 - pytest-benchmark
 - pytest-cov
-- pytest-mpl
 - pytest-xdist
 - python-louvain
-- pytorch>=2.3,<2.4.0a0
+- pytorch>=2.3
 - raft-dask==25.2.*,>=0.0.0a0
 - rapids-build-backend>=0.3.1,<0.4.0.dev0
 - rapids-dask-dependency==25.2.*,>=0.0.0a0
diff --git a/conda/recipes/cugraph-dgl/build.sh b/conda/recipes/cugraph-dgl/build.sh
deleted file mode 100644
index 14d29b7eab9..00000000000
--- a/conda/recipes/cugraph-dgl/build.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (c) 2022, NVIDIA CORPORATION.
-
-# This assumes the script is executed from the root of the repo directory
-
-./build.sh cugraph-dgl
diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml
deleted file mode 100644
index 0383fc8adf8..00000000000
--- a/conda/recipes/cugraph-dgl/meta.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
-{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
-{% set py_version = environ['CONDA_PY'] %}
-{% set date_string = environ['RAPIDS_DATE_STRING'] %}
-
-package:
-  name: cugraph-dgl
-  version: {{ version }}
-
-source:
-  path: ../../..
-
-build:
-  number: {{ GIT_DESCRIBE_NUMBER }}
-  build:
-      number: {{ GIT_DESCRIBE_NUMBER }}
-      string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
-
-requirements:
-  host:
-    - python
-    - rapids-build-backend>=0.3.1,<0.4.0.dev0
-    - setuptools>=61.0.0
-  run:
-    - cugraph ={{ version }}
-    - dgl >=2.4.0.th23.cu*
-    - numba >=0.57
-    - numpy >=1.23,<3.0a0
-    - pylibcugraphops ={{ minor_version }}
-    - tensordict >=0.1.2
-    - python
-    - pytorch >=2.3,<2.4.0a0
-    - cupy >=12.0.0
-
-tests:
-  imports:
-    - cugraph_dgl
-
-about:
-  home: https://rapids.ai/
-  dev_url: https://github.com/rapidsai/cugraph
-  license: Apache-2.0
-  license_file: ../../../LICENSE
-  summary: cuGraph library
diff --git a/conda/recipes/cugraph-equivariant/build.sh b/conda/recipes/cugraph-equivariant/build.sh
deleted file mode 100644
index f0ff1688b55..00000000000
--- a/conda/recipes/cugraph-equivariant/build.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-# This assumes the script is executed from the root of the repo directory
-
-./build.sh cugraph-equivariant
diff --git a/conda/recipes/cugraph-equivariant/meta.yaml b/conda/recipes/cugraph-equivariant/meta.yaml
deleted file mode 100644
index 9dc9d51fa48..00000000000
--- a/conda/recipes/cugraph-equivariant/meta.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
-{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
-{% set py_version = environ['CONDA_PY'] %}
-{% set date_string = environ['RAPIDS_DATE_STRING'] %}
-
-package:
-  name: cugraph-equivariant
-  version: {{ version }}
-
-source:
-  path: ../../..
-
-build:
-  number: {{ GIT_DESCRIBE_NUMBER }}
-  build:
-      number: {{ GIT_DESCRIBE_NUMBER }}
-      string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
-
-requirements:
-  host:
-    - python
-    - rapids-build-backend>=0.3.1,<0.4.0.dev0
-    - setuptools>=61.0.0
-  run:
-    - pylibcugraphops ={{ minor_version }}
-    - python
-
-tests:
-  imports:
-    - cugraph_equivariant
-
-about:
-  home: https://rapids.ai/
-  dev_url: https://github.com/rapidsai/cugraph
-  license: Apache-2.0
-  license_file: ../../../LICENSE
-  summary: GPU-accelerated equivariant convolutional layers.
diff --git a/conda/recipes/cugraph-pyg/build.sh b/conda/recipes/cugraph-pyg/build.sh
deleted file mode 100644
index ad2502985e5..00000000000
--- a/conda/recipes/cugraph-pyg/build.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) 2022, NVIDIA CORPORATION.
-
-# This assumes the script is executed from the root of the repo directory
-./build.sh cugraph-pyg --allgpuarch
diff --git a/conda/recipes/cugraph-pyg/conda_build_config.yaml b/conda/recipes/cugraph-pyg/conda_build_config.yaml
deleted file mode 100644
index 47d98b4800b..00000000000
--- a/conda/recipes/cugraph-pyg/conda_build_config.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
-
-c_compiler_version:
-  - 11
-
-cxx_compiler_version:
-  - 11
-
-cuda_compiler:
-  - nvcc
-
-cmake_version:
-  - ">=3.26.4,!=3.30.0"
-
-c_stdlib:
-  - sysroot
-
-c_stdlib_version:
-  - "2.17"
diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml
deleted file mode 100644
index 7d3e503e23a..00000000000
--- a/conda/recipes/cugraph-pyg/meta.yaml
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-
-{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
-{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
-{% set py_version = environ['CONDA_PY'] %}
-{% set date_string = environ['RAPIDS_DATE_STRING'] %}
-
-package:
-  name: cugraph-pyg
-  version: {{ version }}
-
-source:
-  path: ../../..
-
-build:
-  number: {{ GIT_DESCRIBE_NUMBER }}
-  string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
-  script_env:
-    - PARALLEL_LEVEL
-
-requirements:
-  build:
-    - {{ stdlib("c") }}
-  host:
-    - cython >=3.0.0
-    - python
-    - rapids-build-backend>=0.3.1,<0.4.0.dev0
-    - setuptools>=61.0.0
-  run:
-    - rapids-dask-dependency ={{ minor_version }}
-    - numba >=0.57
-    - numpy >=1.23,<3.0a0
-    - python
-    - pytorch >=2.3,<2.4.0a0
-    - cupy >=12.0.0
-    - cugraph ={{ version }}
-    - pylibcugraphops ={{ minor_version }}
-    - tensordict >=0.1.2
-    - pytorch_geometric >=2.5,<2.6
-
-tests:
-  imports:
-    - cugraph_pyg
-
-about:
-  home: https://rapids.ai/
-  dev_url: https://github.com/rapidsai/cugraph
-  license: Apache-2.0
-  license_file: ../../../LICENSE
-  summary: cuGraph-pyg library
diff --git a/conda/recipes/nx-cugraph/build.sh b/conda/recipes/nx-cugraph/build.sh
deleted file mode 100644
index 26665c1e76a..00000000000
--- a/conda/recipes/nx-cugraph/build.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (c) 2023, NVIDIA CORPORATION.
-
-# This assumes the script is executed from the root of the repo directory
-
-./build.sh nx-cugraph
diff --git a/conda/recipes/nx-cugraph/meta.yaml b/conda/recipes/nx-cugraph/meta.yaml
deleted file mode 100644
index 263f53d9a8f..00000000000
--- a/conda/recipes/nx-cugraph/meta.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
-{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
-{% set py_version = environ['CONDA_PY'] %}
-{% set date_string = environ['RAPIDS_DATE_STRING'] %}
-
-package:
-  name: nx-cugraph
-  version: {{ version }}
-
-source:
-  path: ../../..
-
-build:
-  number: {{ GIT_DESCRIBE_NUMBER }}
-  string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
-
-requirements:
-  host:
-    - python
-    - rapids-build-backend>=0.3.1,<0.4.0.dev0
-    - setuptools>=61.0.0
-  run:
-    - pylibcugraph ={{ version }}
-    - networkx >=3.0
-    - cupy >=12.0.0
-    - python
-
-tests:
-  imports:
-    - nx_cugraph
-  commands:
-    - pip check
-  requires:
-    - pip
-
-about:
-  home: https://rapids.ai/
-  dev_url: https://github.com/rapidsai/cugraph
-  license: Apache-2.0
-  license_file: ../../../LICENSE
-  summary: cuGraph backend for GPU-accelerated NetworkX
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2cea2e504ab..65772b4f5dd 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -167,6 +167,7 @@ set(CUGRAPH_SOURCES
     src/detail/groupby_and_count_mg_v64_e64.cu
     src/detail/collect_comm_wrapper_mg_v32_e32.cu
     src/detail/collect_comm_wrapper_mg_v64_e64.cu
+    src/sampling/detail/conversion_utilities.cu
     src/sampling/random_walks_mg_v64_e64.cu
     src/sampling/random_walks_mg_v32_e32.cu
     src/community/detail/common_methods_mg_v64_e64.cu
@@ -264,10 +265,10 @@ set(CUGRAPH_SOURCES
     src/sampling/detail/sample_edges_mg_v32_e32.cu
     src/sampling/detail/shuffle_and_organize_output_mg_v64_e64.cu
     src/sampling/detail/shuffle_and_organize_output_mg_v32_e32.cu
-    src/sampling/neighbor_sampling_mg_v32_e32.cpp
-    src/sampling/neighbor_sampling_mg_v64_e64.cpp
-    src/sampling/neighbor_sampling_sg_v32_e32.cpp
-    src/sampling/neighbor_sampling_sg_v64_e64.cpp
+    src/sampling/neighbor_sampling_mg_v32_e32.cu
+    src/sampling/neighbor_sampling_mg_v64_e64.cu
+    src/sampling/neighbor_sampling_sg_v32_e32.cu
+    src/sampling/neighbor_sampling_sg_v64_e64.cu
     src/sampling/negative_sampling_sg_v32_e32.cu
     src/sampling/negative_sampling_sg_v64_e64.cu
     src/sampling/negative_sampling_mg_v32_e32.cu
@@ -537,6 +538,8 @@ add_library(cugraph_c
         src/c_api/weakly_connected_components.cpp
         src/c_api/strongly_connected_components.cpp
         src/c_api/allgather.cpp
+        src/c_api/decompress_to_edgelist.cpp
+        src/c_api/edgelist.cpp
         )
 add_library(cugraph::cugraph_c ALIAS cugraph_c)
 
diff --git a/cpp/include/cugraph/detail/utility_wrappers.hpp b/cpp/include/cugraph/detail/utility_wrappers.hpp
index 3d99b85556b..b1afeafd66b 100644
--- a/cpp/include/cugraph/detail/utility_wrappers.hpp
+++ b/cpp/include/cugraph/detail/utility_wrappers.hpp
@@ -65,6 +65,48 @@ void uniform_random_fill(rmm::cuda_stream_view const& stream_view,
 template <typename value_t>
 void scalar_fill(raft::handle_t const& handle, value_t* d_value, size_t size, value_t value);
 
+/**
+ * @brief    Sort a device span
+ *
+ * @tparam      value_t      type of the value to operate on. Must be either int32_t or int64_t.
+ *
+ * @param [in]  handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
+ * and handles to various CUDA libraries) to run graph algorithms.
+ * @param[out]  values      device span to sort
+ *
+ */
+template <typename value_t>
+void sort_ints(raft::handle_t const& handle, raft::device_span<value_t> values);
+
+/**
+ * @brief    Keep unique element from a device span
+ *
+ * @tparam      value_t      type of the value to operate on. Must be either int32_t or int64_t.
+ *
+ * @param [in]  handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
+ * and handles to various CUDA libraries) to run graph algorithms.
+ * @param[in]  values      device span of unique elements.
+ * @return the number of unique elements.
+ *
+ */
+template <typename value_t>
+size_t unique_ints(raft::handle_t const& handle, raft::device_span<value_t> values);
+
+/**
+ * @brief    Increment the values of a device span by a constant value
+ *
+ * @tparam      value_t      type of the value to operate on. Must be either int32_t or int64_t.
+ *
+ * @param[out]  values       device span to update
+ * @param[in]   value        value to be added to each element of the buffer
+ * @param[in]   stream_view  stream view
+ *
+ */
+template <typename value_t>
+void transform_increment_ints(raft::device_span<value_t> values,
+                              value_t value,
+                              rmm::cuda_stream_view const& stream_view);
+
 /**
  * @brief    Fill a buffer with a sequence of values
  *
@@ -73,7 +115,7 @@ void scalar_fill(raft::handle_t const& handle, value_t* d_value, size_t size, va
  *
  * Similar to the function std::iota, wraps the function thrust::sequence
  *
- * @tparam      value_t      type of the value to operate on
+ * @tparam      value_t      type of the value to operate on.
  *
  * @param[in]   stream_view  stream view
  * @param[out]  d_value      device array to fill
diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp
index 783cd3a7e2b..3d41e954416 100644
--- a/cpp/include/cugraph/sampling_functions.hpp
+++ b/cpp/include/cugraph/sampling_functions.hpp
@@ -43,6 +43,8 @@ enum class prior_sources_behavior_t { DEFAULT = 0, CARRY_OVER, EXCLUDE };
 /**
  * @brief Uniform Neighborhood Sampling.
  *
+ * @deprecated Replaced with homogeneous_uniform_neighbor_sample
+ *
  * This function traverses from a set of starting vertices, traversing outgoing edges and
  * randomly selects from these outgoing neighbors to extract a subgraph.
  *
@@ -53,19 +55,20 @@ enum class prior_sources_behavior_t { DEFAULT = 0, CARRY_OVER, EXCLUDE };
  * encountered in.  The label output (optional) identifes the vertex label.  The offsets array
  * (optional) will be described below and is dependent upon the input parameters.
  *
- * If @p starting_vertex_labels is not specified then no organization is applied to the output, the
- * label and offsets values in the return set will be std::nullopt.
+ * If @p starting_vertex_label_offsets is not specified then no organization is applied to the
+ * output, the label and offsets values in the return set will be std::nullopt.
  *
- * If @p starting_vertex_labels is specified and @p label_to_output_comm_rank is not specified then
- * the label output has values.  This will also result in the output being sorted by vertex label.
- * The offsets array in the return will be a CSR-style offsets array to identify the beginning of
- * each label range in the data.  `labels.size() == (offsets.size() - 1)`.
+ * If @p starting_vertex_label_offsets is specified and @p label_to_output_comm_rank is not
+ * specified then the label output has values.  This will also result in the output being sorted by
+ * vertex label. The offsets array in the return will be a CSR-style offsets array to identify the
+ * beginning of each label range in the data.  `labels.size() == (offsets.size() - 1)`.
  *
- * If @p starting_vertex_labels is specified and @p label_to_output_comm_rank is specified then the
- * label output has values.  This will also result in the output being sorted by vertex label.  The
- * offsets array in the return will be a CSR-style offsets array to identify the beginning of each
- * label range in the data.  `labels.size() == (offsets.size() - 1)`.  Additionally, the data will
- * be shuffled so that all data with a particular label will be on the specified rank.
+ * If @p starting_vertex_label_offsets is specified and @p label_to_output_comm_rank is specified
+ * then the label output has values.  This will also result in the output being sorted by vertex
+ * label.  The offsets array in the return will be a CSR-style offsets array to identify the
+ * beginning of each label range in the data.  `labels.size() == (offsets.size() - 1)`.
+ * Additionally, the data will be shuffled so that all data with a particular label will be on the
+ * specified rank.
  *
  * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
  * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
@@ -83,8 +86,8 @@ enum class prior_sources_behavior_t { DEFAULT = 0, CARRY_OVER, EXCLUDE };
  * @param edge_type_view Optional view object holding edge types for @p graph_view.
  * @param starting_vertices Device span of starting vertex IDs for the sampling.
  * In a multi-gpu context the starting vertices should be local to this GPU.
- * @param starting_vertex_labels Optional device span of labels associted with each starting vertex
- * for the sampling.
+ * @param starting_vertex_label_offsets Optional device span of labels associated with each starting
+ * vertex for the sampling.
  * @param label_to_output_comm_rank Optional tuple of device spans mapping label to a particular
  * output rank.  Element 0 of the tuple identifes the label, Element 1 of the tuple identifies the
  * output rank.  The label span must be sorted in ascending order.
@@ -126,7 +129,7 @@ uniform_neighbor_sample(
   std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
   std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
   raft::device_span<vertex_t const> starting_vertices,
-  std::optional<raft::device_span<label_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<label_t const>> starting_vertex_label_offsets,
   std::optional<std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>>
     label_to_output_comm_rank,
   raft::host_span<int32_t const> fan_out,
@@ -140,6 +143,8 @@ uniform_neighbor_sample(
 /**
  * @brief Biased Neighborhood Sampling.
  *
+ * @deprecated Replaced with homogeneous_biased_neighbor_sample
+ *
  * This function traverses from a set of starting vertices, traversing outgoing edges and
  * randomly selects (with edge biases) from these outgoing neighbors to extract a subgraph.
  *
@@ -150,24 +155,26 @@ uniform_neighbor_sample(
  * encountered in.  The label output (optional) identifes the vertex label.  The offsets array
  * (optional) will be described below and is dependent upon the input parameters.
  *
- * If @p starting_vertex_labels is not specified then no organization is applied to the output, the
- * label and offsets values in the return set will be std::nullopt.
+ * If @p starting_vertex_label_offsets is not specified then no organization is applied to the
+ * output, the label and offsets values in the return set will be std::nullopt.
  *
- * If @p starting_vertex_labels is specified and @p label_to_output_comm_rank is not specified then
- * the label output has values.  This will also result in the output being sorted by vertex label.
- * The offsets array in the return will be a CSR-style offsets array to identify the beginning of
- * each label range in the data.  `labels.size() == (offsets.size() - 1)`.
+ * If @p starting_vertex_label_offsets is specified and @p label_to_output_comm_rank is not
+ * specified then the label output has values.  This will also result in the output being sorted by
+ * vertex label. The offsets array in the return will be a CSR-style offsets array to identify the
+ * beginning of each label range in the data.  `labels.size() == (offsets.size() - 1)`.
  *
- * If @p starting_vertex_labels is specified and @p label_to_output_comm_rank is specified then the
- * label output has values.  This will also result in the output being sorted by vertex label.  The
- * offsets array in the return will be a CSR-style offsets array to identify the beginning of each
- * label range in the data.  `labels.size() == (offsets.size() - 1)`.  Additionally, the data will
- * be shuffled so that all data with a particular label will be on the specified rank.
+ * If @p starting_vertex_label_offsets is specified and @p label_to_output_comm_rank is specified
+ * then the label output has values.  This will also result in the output being sorted by vertex
+ * label.  The offsets array in the return will be a CSR-style offsets array to identify the
+ * beginning of each label range in the data.  `labels.size() == (offsets.size() - 1)`.
+ * Additionally, the data will be shuffled so that all data with a particular label will be on the
+ * specified rank.
  *
  * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
  * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
  * @tparam weight_t Type of edge weights. Needs to be a floating point type.
  * @tparam edge_type_t Type of edge type. Needs to be an integral type.
+ * @tparam bias_t Type of bias. Needs to be an integral type.
  * @tparam label_t Type of label. Needs to be an integral type.
  * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if
  * true) are major indices
@@ -184,8 +191,8 @@ uniform_neighbor_sample(
  * corresponding edge can never be selected.
  * @param starting_vertices Device span of starting vertex IDs for the sampling.
  * In a multi-gpu context the starting vertices should be local to this GPU.
- * @param starting_vertex_labels Optional device span of labels associted with each starting vertex
- * for the sampling.
+ * @param starting_vertex_label_offsets Optional device span of labels associated with each starting
+ * vertex for the sampling.
  * @param label_to_output_comm_rank Optional tuple of device spans mapping label to a particular
  * output rank.  Element 0 of the tuple identifes the label, Element 1 of the tuple identifies the
  * output rank.  The label span must be sorted in ascending order.
@@ -229,7 +236,7 @@ biased_neighbor_sample(
   std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
   edge_property_view_t<edge_t, bias_t const*> edge_bias_view,
   raft::device_span<vertex_t const> starting_vertices,
-  std::optional<raft::device_span<label_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<label_t const>> starting_vertex_label_offsets,
   std::optional<std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>>
     label_to_output_comm_rank,
   raft::host_span<int32_t const> fan_out,
@@ -240,6 +247,349 @@ biased_neighbor_sample(
   bool dedupe_sources                             = false,
   bool do_expensive_check                         = false);
 
+struct sampling_flags_t {
+  /**
+   * Specifies how to handle prior sources. Default is DEFAULT.
+   */
+  prior_sources_behavior_t prior_sources_behavior{};
+
+  /**
+   * Specifies if the hop information should be returned.  Default is false.
+   */
+  bool return_hops{false};
+
+  /**
+   * If true then if a vertex v appears as a destination in hop X multiple times
+   * with the same label, it will only be passed once (for each label) as a source
+   * for the next hop.  Default is false.
+   */
+  bool dedupe_sources{false};
+
+  /**
+   * Specifies if random sampling is done with replacement
+   *   (true) or without replacement (false).  Default is true.
+   */
+  bool with_replacement{true};
+};
+
+/**
+ * @brief Homogeneous Uniform Neighborhood Sampling.
+ *
+ * This function traverses from a set of starting vertices, traversing outgoing edges and
+ * randomly selects (uniformly) from these outgoing neighbors to extract a subgraph.
+ * The branching out to select outgoing neighbors is performed with homogeneous fanouts
+ *
+ * Output from this function is a tuple of vectors (src, dst, weight, edge_id, edge_type, hop,
+ * offsets), identifying the randomly selected edges where the size of src, dst, weight, edge_id,
+ * edge_type and hop is the number of sampled edges while the size of the offsets vector is the
+ * number of labels + 1.  src is the source vertex, dst is the destination vertex, weight
+ * (optional) is the edge weight, edge_id (optional) identifies the edge id, edge_type (optional)
+ * identifies the edge type, hop identifies which hop the edge was encountered in.
+ * The offsets array (optional) identifies the offset for each label.
+ *
+ * If @p label_to_output_comm_rank is specified then the data will be shuffled so that all entries
+ * for a particular label are returned on the specified rank.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam edge_type_t Type of edge type. Needs to be an integral type.
+ * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if
+ * true) are major indices
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state A pre-initialized raft::RngState object for generating random numbers
+ * @param graph_view Graph View object to generate NBR Sampling on.
+ * @param edge_weight_view Optional view object holding edge weights for @p graph_view.
+ * @param edge_id_view Optional view object holding edge ids for @p graph_view.
+ * @param edge_type_view Optional view object holding edge types for @p graph_view.
+ * @param starting_vertices Device span of starting vertex IDs for the sampling.
+ * In a multi-gpu context the starting vertices should be local to this GPU.
+ * @param starting_vertex_label_offsets Optional device span of labels associated with each starting
+ * vertex for the sampling.
+ * @param label_to_output_comm_rank Optional device span identifying which rank should get sampling
+ * outputs of each vertex label.  This should be the same on each rank.
+ * @param fan_out Host span defining branching out (fan-out) degree per source vertex for each
+ * level.
+ * @param flags A set of flags indicating which sampling features should be used.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return tuple device vectors (vertex_t source_vertex, vertex_t destination_vertex,
+ * optional weight_t weight, optional edge_t edge id, optional edge_type_t edge type,
+ * optional int32_t hop, optional label_t label, optional size_t offsets)
+ */
+
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_label_offsets,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check = false);
+
+/**
+ * @brief Homogeneous Biased Neighborhood Sampling.
+ *
+ * This function traverses from a set of starting vertices, traversing outgoing edges and
+ * randomly selects (with edge biases) from these outgoing neighbors to extract a subgraph.
+ * The branching out to select outgoing neighbors is performed with homogeneous fanouts
+ *
+ * Output from this function is a tuple of vectors (src, dst, weight, edge_id, edge_type, hop,
+ * offsets), identifying the randomly selected edges where the size of src, dst, weight, edge_id,
+ * edge_type and hop is the number of sampled edges while the size of the offsets vector is the
+ * number of labels + 1.  src is the source vertex, dst is the destination vertex, weight
+ * (optional) is the edge weight, edge_id (optional) identifies the edge id, edge_type (optional)
+ * identifies the edge type, hop identifies which hop the edge was encountered in.
+ * The offsets array (optional) identifies the offset for each label.
+ *
+ * If @p label_to_output_comm_rank is specified then the data will be shuffled so that all entries
+ * for a particular label are returned on the specified rank.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam edge_type_t Type of edge type. Needs to be an integral type.
+ * @tparam bias_t Type of bias. Needs to be an integral type.
+ * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if
+ * true) are major indices
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state A pre-initialized raft::RngState object for generating random numbers
+ * @param graph_view Graph View object to generate NBR Sampling on.
+ * @param edge_weight_view Optional view object holding edge weights for @p graph_view.
+ * @param edge_id_view Optional view object holding edge ids for @p graph_view.
+ * @param edge_type_view Optional view object holding edge types for @p graph_view.
+ * @param edge_bias_view View object holding edge biases (to be used in biased sampling) for @p
+ * graph_view. Bias values should be non-negative and the sum of edge bias values from any vertex
+ * should not exceed std::numeric_limits<bias_t>::max(). 0 bias value indicates that the
+ * corresponding edge can never be selected.
+ * @param starting_vertices Device span of starting vertex IDs for the sampling.
+ * In a multi-gpu context the starting vertices should be local to this GPU.
+ * @param starting_vertex_label_offsets Optional device span of labels associated with each starting
+ * vertex for the sampling.
+ * @param label_to_output_comm_rank Optional device span identifying which rank should get sampling
+ * outputs of each vertex label.  This should be the same on each rank.
+ * @param fan_out Host span defining branching out (fan-out) degree per source vertex for each
+ * level.
+ * @param flags A set of flags indicating which sampling features should be used.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return tuple device vectors (vertex_t source_vertex, vertex_t destination_vertex,
+ * optional weight_t weight, optional edge_t edge id, optional edge_type_t edge type,
+ * optional int32_t hop, optional label_t label, optional size_t offsets)
+ */
+
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          typename bias_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  edge_property_view_t<edge_t, bias_t const*> edge_bias_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_label_offsets,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check = false);
+
+/**
+ * @brief Heterogeneous Uniform Neighborhood Sampling.
+ *
+ * This function traverses from a set of starting vertices, traversing outgoing edges and
+ * randomly selects (uniformly) from these outgoing neighbors to extract a subgraph.
+ * The branching out to select outgoing neighbors is performed with heterogeneous fanouts
+ * where the number of edge types is bigger than 1.
+ *
+ * Output from this function is a tuple of vectors (src, dst, weight, edge_id, edge_type, hop,
+ * offsets), identifying the randomly selected edges where the size of src, dst, weight, edge_id,
+ * edge_type and hop is the number of sampled edges while the size of the offsets vector is the
+ * number of labels + 1.  src is the source vertex, dst is the destination vertex, weight
+ * (optional) is the edge weight, edge_id (optional) identifies the edge id, edge_type (optional)
+ * identifies the edge type, hop identifies which hop the edge was encountered in.
+ * The offsets array (optional) identifies the offset for each label.
+ *
+ * If @p label_to_output_comm_rank is specified then the data will be shuffled so that all entries
+ * for a particular label are returned on the specified rank.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam edge_type_t Type of edge type. Needs to be an integral type.
+ * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if
+ * true) are major indices
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state A pre-initialized raft::RngState object for generating random numbers
+ * @param graph_view Graph View object to generate NBR Sampling on.
+ * @param edge_weight_view Optional view object holding edge weights for @p graph_view.
+ * @param edge_id_view Optional view object holding edge ids for @p graph_view.
+ * @param edge_type_view Optional view object holding edge types for @p graph_view.
+ * @param starting_vertices Device span of starting vertex IDs for the sampling.
+ * In a multi-gpu context the starting vertices should be local to this GPU.
+ * @param starting_vertex_label_offsets Optional device span of labels associated with each starting
+ * vertex for the sampling.
+ * @param label_to_output_comm_rank Optional device span identifying which rank should get sampling
+ * outputs of each vertex label.  This should be the same on each rank.
+ * @param fan_out Host span defining branching out (fan-out) degree per source vertex for each
+ * level. The fanout value at hop x is given by the expression 'fanout[x*num_edge_types +
+ * edge_type_id]'
+ * @param num_edge_types Number of edge types where a value of 1 translates to homogeneous neighbor
+ * sample whereas a value greater than 1 translates to heterogeneous neighbor sample.
+ * @param flags A set of flags indicating which sampling features should be used.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return tuple device vectors (vertex_t source_vertex, vertex_t destination_vertex,
+ * optional weight_t weight, optional edge_t edge id, optional edge_type_t edge type,
+ * optional int32_t hop, optional label_t label, optional size_t offsets)
+ */
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_label_offsets,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  edge_type_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check = false);
+
+/**
+ * @brief Heterogeneous Biased Neighborhood Sampling.
+ *
+ * This function traverses from a set of starting vertices, traversing outgoing edges and
+ * randomly selects (with edge biases) from these outgoing neighbors to extract a subgraph.
+ * The branching out to select outgoing neighbors is performed with heterogeneous fanouts
+ * where the number of edge types is bigger than 1.
+ *
+ * Output from this function is a tuple of vectors (src, dst, weight, edge_id, edge_type, hop,
+ * offsets), identifying the randomly selected edges where the size of src, dst, weight, edge_id,
+ * edge_type and hop is the number of sampled edges while the size of the offsets vector is the
+ * number of labels + 1.  src is the source vertex, dst is the destination vertex, weight
+ * (optional) is the edge weight, edge_id (optional) identifies the edge id, edge_type (optional)
+ * identifies the edge type, hop identifies which hop the edge was encountered in.
+ * The offsets array (optional) identifies the offset for each label.
+ *
+ * If @p label_to_output_comm_rank is specified then the data will be shuffled so that all entries
+ * for a particular label are returned on the specified rank.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam edge_type_t Type of edge type. Needs to be an integral type.
+ * @tparam bias_t Type of bias. Needs to be an integral type.
+ * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if
+ * true) are major indices
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state A pre-initialized raft::RngState object for generating random numbers
+ * @param graph_view Graph View object to generate NBR Sampling on.
+ * @param edge_weight_view Optional view object holding edge weights for @p graph_view.
+ * @param edge_id_view Optional view object holding edge ids for @p graph_view.
+ * @param edge_type_view Optional view object holding edge types for @p graph_view.
+ * @param edge_bias_view View object holding edge biases (to be used in biased sampling) for @p
+ * graph_view. Bias values should be non-negative and the sum of edge bias values from any vertex
+ * should not exceed std::numeric_limits<bias_t>::max(). 0 bias value indicates that the
+ * corresponding edge can never be selected.
+ * @param starting_vertices Device span of starting vertex IDs for the sampling.
+ * In a multi-gpu context the starting vertices should be local to this GPU.
+ * @param starting_vertex_label_offsets Optional device span of labels associated with each starting
+ * vertex for the sampling.
+ * @param label_to_output_comm_rank Optional device span identifying which rank should get sampling
+ * outputs of each vertex label.  This should be the same on each rank.
+ * @param fan_out Host span defining branching out (fan-out) degree per source vertex for each
+ * level. The fanout value at hop x is given by the expression 'fanout[x*num_edge_types +
+ * edge_type_id]'
+ * @param num_edge_types Number of edge types where a value of 1 translates to homogeneous neighbor
+ * sample whereas a value greater than 1 translates to heterogeneous neighbor sample.
+ * @param flags A set of flags indicating which sampling features should be used.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return tuple device vectors (vertex_t source_vertex, vertex_t destination_vertex,
+ * optional weight_t weight, optional edge_t edge id, optional edge_type_t edge type,
+ * optional int32_t hop, optional label_t label, optional size_t offsets)
+ */
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          typename bias_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  edge_property_view_t<edge_t, bias_t const*> edge_bias_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_label_offsets,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  edge_type_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check = false);
+
 /*
  * @brief renumber sampled edge list and compress to the (D)CSR|(D)CSC format.
  *
diff --git a/cpp/include/cugraph_c/graph_functions.h b/cpp/include/cugraph_c/graph_functions.h
index ff7e439232a..964b2f2c8d6 100644
--- a/cpp/include/cugraph_c/graph_functions.h
+++ b/cpp/include/cugraph_c/graph_functions.h
@@ -104,6 +104,8 @@ cugraph_error_code_t cugraph_two_hop_neighbors(
 
 /**
  * @brief       Opaque induced subgraph type
+ *
+ * @deprecated  This API will be deleted, use cugraph_edgelist_t
  */
 typedef struct {
   int32_t align_;
@@ -112,6 +114,8 @@ typedef struct {
 /**
  * @brief       Get the source vertex ids
  *
+ * @deprecated  This API will be deleted, use cugraph_edgelist_get_sources
+ *
  * @param [in]     induced_subgraph   Opaque pointer to induced subgraph
  * @return type erased array view of source vertex ids
  */
@@ -121,6 +125,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_sources(
 /**
  * @brief       Get the destination vertex ids
  *
+ * @deprecated  This API will be deleted, use cugraph_edgelist_get_destinations
+ *
  * @param [in]     induced_subgraph   Opaque pointer to induced subgraph
  * @return type erased array view of destination vertex ids
  */
@@ -130,6 +136,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_destinatio
 /**
  * @brief       Get the edge weights
  *
+ * @deprecated  This API will be deleted, use cugraph_edgelist_get_edge_weights
+ *
  * @param [in]     induced_subgraph   Opaque pointer to induced subgraph
  * @return type erased array view of edge weights
  */
@@ -139,6 +147,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_edge_weigh
 /**
  * @brief       Get the edge ids
  *
+ * @deprecated  This API will be deleted, use cugraph_edgelist_get_edge_ids
+ *
  * @param [in]     induced_subgraph   Opaque pointer to induced subgraph
  * @return type erased array view of edge ids
  */
@@ -148,6 +158,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_edge_ids(
 /**
  * @brief       Get the edge types
  *
+ * @deprecated  This API will be deleted, use cugraph_edgelist_get_edge_type_ids
+ *
  * @param [in]     induced_subgraph   Opaque pointer to induced subgraph
  * @return type erased array view of edge types
  */
@@ -157,6 +169,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_edge_type_
 /**
  * @brief       Get the subgraph offsets
  *
+ * @deprecated  This API will be deleted, use cugraph_edgelist_get_edge_offsets
+ *
  * @param [in]     induced_subgraph   Opaque pointer to induced subgraph
  * @return type erased array view of subgraph identifiers
  */
@@ -166,6 +180,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_subgraph_o
 /**
  * @brief     Free induced subgraph
  *
+ * @deprecated  This API will be deleted, use cugraph_edgelist_free
+ *
  * @param [in]    induced subgraph   Opaque pointer to induced subgraph
  */
 void cugraph_induced_subgraph_result_free(cugraph_induced_subgraph_result_t* induced_subgraph);
@@ -361,6 +377,92 @@ cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_out_degrees(
  */
 void cugraph_degrees_result_free(cugraph_degrees_result_t* degrees_result);
 
+/**
+ * @brief       Opaque edgelist type
+ *
+ */
+typedef struct {
+  int32_t align_;
+} cugraph_edgelist_t;
+
+/**
+ * @brief       Get the source vertex ids
+ *
+ * @param [in]     edgelist   Opaque pointer to edgelist
+ * @return type erased array view of source vertex ids
+ */
+cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_sources(cugraph_edgelist_t* edgelist);
+
+/**
+ * @brief       Get the destination vertex ids
+ *
+ * @param [in]     edgelist   Opaque pointer to edgelist
+ * @return type erased array view of destination vertex ids
+ */
+cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_destinations(
+  cugraph_edgelist_t* edgelist);
+
+/**
+ * @brief       Get the edge weights
+ *
+ * @param [in]     edgelist   Opaque pointer to edgelist
+ * @return type erased array view of edge weights
+ */
+cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_weights(
+  cugraph_edgelist_t* edgelist);
+
+/**
+ * @brief       Get the edge ids
+ *
+ * @param [in]     edgelist   Opaque pointer to edgelist
+ * @return type erased array view of edge ids
+ */
+cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_ids(
+  cugraph_edgelist_t* edgelist);
+
+/**
+ * @brief       Get the edge types
+ *
+ * @param [in]     edgelist   Opaque pointer to edgelist
+ * @return type erased array view of edge types
+ */
+cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_type_ids(
+  cugraph_edgelist_t* edgelist);
+
+/**
+ * @brief       Get the edge offsets
+ *
+ * @param [in]     edgelist   Opaque pointer to edgelist
+ * @return type erased array view of subgraph identifiers
+ */
+cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_offsets(
+  cugraph_edgelist_t* edgelist);
+
+/**
+ * @brief     Free edgelist
+ *
+ * @param [in]    edgelist   Opaque pointer to edgelist
+ */
+void cugraph_edgelist_free(cugraph_edgelist_t* edgelist);
+
+/**
+ * @brief       Construct the edge list from the graph view object.
+ *
+ * @param [in]  handle              Handle for accessing resources
+ * @param [in]  graph               Graph to operate on
+ * @param [in]  do_expensive_check  A flag to run expensive checks for input arguments (if set to
+ * true)
+ * @param [out] result              Opaque pointer to edgelist
+ * @param [out] error               Pointer to an error object storing details of any error.  Will
+ *                                  be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_decompress_to_edgelist(const cugraph_resource_handle_t* handle,
+                                                    cugraph_graph_t* graph,
+                                                    bool_t do_expensive_check,
+                                                    cugraph_edgelist_t** result,
+                                                    cugraph_error_t** error);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h
index bb26e577915..ef75e726d80 100644
--- a/cpp/include/cugraph_c/sampling_algorithms.h
+++ b/cpp/include/cugraph_c/sampling_algorithms.h
@@ -199,6 +199,13 @@ typedef struct {
   int32_t align_;
 } cugraph_sampling_options_t;
 
+/**
+ * @brief     Opaque sampling options type
+ */
+typedef struct {
+  int32_t align_;
+} sampling_flags_t;
+
 /**
  * @brief     Enumeration for prior sources behavior
  */
@@ -323,6 +330,8 @@ void cugraph_sampling_options_free(cugraph_sampling_options_t* options);
 /**
  * @brief     Uniform Neighborhood Sampling
  *
+ * @deprecated  This API will be deleted, use cugraph_homogeneous_uniform_neighbor_sample
+ *
  * Returns a sample of the neighborhood around specified start vertices.  Optionally, each
  * start vertex can be associated with a label, allowing the caller to specify multiple batches
  * of sampling requests in the same function call - which should improve GPU utilization.
@@ -348,8 +357,8 @@ void cugraph_sampling_options_free(cugraph_sampling_options_t* options);
  * label_to_comm_rank[i].  If not specified then the output data will not be shuffled between ranks.
  * @param [in]  label_offsets Device array of the offsets for each label in the seed list.  This
  *                            parameter is only used with the retain_seeds option.
- * @param [in]  fanout       Host array defining the fan out at each step in the sampling algorithm.
- *                           We only support fanout values of type INT32
+ * @param [in]  fan_out       Host array defining the fan out at each step in the sampling
+ * algorithm. We only support fan_out values of type INT32
  * @param [in,out] rng_state State of the random number generator, updated with each call
  * @param [in]  sampling_options
  *                           Opaque pointer defining the sampling options.
@@ -378,6 +387,8 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample(
 /**
  * @brief     Biased Neighborhood Sampling
  *
+ * @deprecated  This API will be deleted, use cugraph_homogeneous_biased_neighbor_sample.
+ *
  * Returns a sample of the neighborhood around specified start vertices.  Optionally, each
  * start vertex can be associated with a label, allowing the caller to specify multiple batches
  * of sampling requests in the same function call - which should improve GPU utilization.
@@ -406,8 +417,8 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample(
  * label_to_comm_rank[i].  If not specified then the output data will not be shuffled between ranks.
  * @param [in]  label_offsets Device array of the offsets for each label in the seed list.  This
  *                            parameter is only used with the retain_seeds option.
- * @param [in]  fanout       Host array defining the fan out at each step in the sampling algorithm.
- *                           We only support fanout values of type INT32
+ * @param [in]  fan_out       Host array defining the fan out at each step in the sampling
+ * algorithm. We only support fan_out values of type INT32
  * @param [in,out] rng_state State of the random number generator, updated with each call
  * @param [in]  sampling_options
  *                           Opaque pointer defining the sampling options.
@@ -434,6 +445,186 @@ cugraph_error_code_t cugraph_biased_neighbor_sample(
   cugraph_sample_result_t** result,
   cugraph_error_t** error);
 
+/**
+ * @brief     Homogeneous Uniform Neighborhood Sampling
+ *
+ * Returns a sample of the neighborhood around specified start vertices and fan_out.
+ * The neighborhood is sampled uniformly.
+ * Optionally, each start vertex can be associated with a label, allowing the caller to specify
+ * multiple batches of sampling requests in the same function call - which should improve GPU
+ * utilization.
+ *
+ * If label is NULL then all start vertices will be considered part of the same batch and the
+ * return value will not have a label column.
+ *
+ * @param [in]  handle       Handle for accessing resources
+ *  * @param [in,out] rng_state State of the random number generator, updated with each call
+ * @param [in]  graph        Pointer to graph.  NOTE: Graph might be modified if the storage
+ *                           needs to be transposed
+ * @param [in]  start_vertices Device array of start vertices for the sampling
+ * @param [in]  starting_vertex_label_offsets Device array of the offsets for each label in
+ * the seed list. This parameter is only used with the retain_seeds option.
+ * @param [in]  fan_out       Host array defining the fan out at each step in the sampling
+ * algorithm. We only support fan_out values of type INT32
+ * @param [in]  sampling_options
+ *                           Opaque pointer defining the sampling options.
+ * @param [in]  do_expensive_check
+ *                           A flag to run expensive checks for input arguments (if set to true)
+ * @param [out]  result      Output from the uniform_neighbor_sample call
+ * @param [out] error        Pointer to an error object storing details of any error.  Will
+ *                           be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_homogeneous_uniform_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error);
+
+/**
+ * @brief     Homogeneous Biased Neighborhood Sampling
+ *
+ * Returns a sample of the neighborhood around specified start vertices and fan_out.
+ * The neighborhood is sampled uniformly.
+ * Optionally, each start vertex can be associated with a label, allowing the caller to specify
+ * multiple batches of sampling requests in the same function call - which should improve GPU
+ * utilization.
+ *
+ * If label is NULL then all start vertices will be considered part of the same batch and the
+ * return value will not have a label column.
+ *
+ * @param [in]  handle       Handle for accessing resources
+ *  * @param [in,out] rng_state State of the random number generator, updated with each call
+ * @param [in]  graph        Pointer to graph.  NOTE: Graph might be modified if the storage
+ *                           needs to be transposed
+ * @param [in]  edge_biases  Device array of edge biases to use for sampling.  If NULL
+ * use the edge weight as the bias. If set to NULL, edges will be sampled uniformly.
+ * @param [in]  start_vertices Device array of start vertices for the sampling
+ * @param [in]  starting_vertex_label_offsets Device array of the offsets for each label in
+ * the seed list. This parameter is only used with the retain_seeds option.
+ * @param [in]  fan_out       Host array defining the fan out at each step in the sampling
+ * algorithm. We only support fan_out values of type INT32
+ * @param [in]  sampling_options
+ *                           Opaque pointer defining the sampling options.
+ * @param [in]  do_expensive_check
+ *                           A flag to run expensive checks for input arguments (if set to true)
+ * @param [out]  result      Output from the uniform_neighbor_sample call
+ * @param [out] error        Pointer to an error object storing details of any error.  Will
+ *                           be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_homogeneous_biased_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_edge_property_view_t* edge_biases,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error);
+
+/**
+ * @brief     Heterogeneous Uniform Neighborhood Sampling
+ *
+ * Returns a sample of the neighborhood around specified start vertices and fan_out.
+ * The neighborhood is sampled uniformly.
+ * Optionally, each start vertex can be associated with a label, allowing the caller to specify
+ * multiple batches of sampling requests in the same function call - which should improve GPU
+ * utilization.
+ *
+ * If label is NULL then all start vertices will be considered part of the same batch and the
+ * return value will not have a label column.
+ *
+ * @param [in]  handle       Handle for accessing resources
+ *  * @param [in,out] rng_state State of the random number generator, updated with each call
+ * @param [in]  graph        Pointer to graph.  NOTE: Graph might be modified if the storage
+ *                           needs to be transposed
+ * @param [in]  start_vertices Device array of start vertices for the sampling
+ * @param [in]  starting_vertex_label_offsets Device array of the offsets for each label in
+ * the seed list. This parameter is only used with the retain_seeds option.
+ * @param [in]  fan_out       Host array defining the fan out at each step in the sampling
+ * algorithm. We only support fan_out values of type INT32
+ * @param [in]  num_edge_types Number of edge types where a value of 1 translates to homogeneous
+ * neighbor sample whereas a value greater than 1 translates to heterogeneous neighbor sample.
+ * @param [in]  sampling_options
+ *                           Opaque pointer defining the sampling options.
+ * @param [in]  do_expensive_check
+ *                           A flag to run expensive checks for input arguments (if set to true)
+ * @param [out]  result      Output from the uniform_neighbor_sample call
+ * @param [out] error        Pointer to an error object storing details of any error.  Will
+ *                           be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_heterogeneous_uniform_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  int num_edge_types,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error);
+
+/**
+ * @brief     Heterogeneous Biased Neighborhood Sampling
+ *
+ * Returns a sample of the neighborhood around specified start vertices and fan_out.
+ * The neighborhood is sampled uniformly.
+ * Optionally, each start vertex can be associated with a label, allowing the caller to specify
+ * multiple batches of sampling requests in the same function call - which should improve GPU
+ * utilization.
+ *
+ * If label is NULL then all start vertices will be considered part of the same batch and the
+ * return value will not have a label column.
+ *
+ * @param [in]  handle       Handle for accessing resources
+ *  * @param [in,out] rng_state State of the random number generator, updated with each call
+ * @param [in]  graph        Pointer to graph.  NOTE: Graph might be modified if the storage
+ *                           needs to be transposed
+ * @param [in]  edge_biases  Device array of edge biases to use for sampling.  If NULL
+ * use the edge weight as the bias. If set to NULL, edges will be sampled uniformly.
+ * @param [in]  start_vertices Device array of start vertices for the sampling
+ * @param [in]  starting_vertex_label_offsets Device array of the offsets for each label in
+ * the seed list. This parameter is only used with the retain_seeds option.
+ * @param [in]  fan_out       Host array defining the fan out at each step in the sampling
+ * algorithm. We only support fan_out values of type INT32
+ * @param [in]  num_edge_types Number of edge types where a value of 1 translates to homogeneous
+ * neighbor sample whereas a value greater than 1 translates to heterogeneous neighbor sample.
+ * @param [in]  sampling_options
+ *                           Opaque pointer defining the sampling options.
+ * @param [in]  do_expensive_check
+ *                           A flag to run expensive checks for input arguments (if set to true)
+ * @param [out]  result      Output from the uniform_neighbor_sample call
+ * @param [out] error        Pointer to an error object storing details of any error.  Will
+ *                           be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_heterogeneous_biased_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_edge_property_view_t* edge_biases,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  int num_edge_types,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error);
+
 /**
  * @deprecated This call should be replaced with cugraph_sample_result_get_majors
  * @brief     Get the source vertices from the sampling algorithm result
@@ -584,6 +775,26 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_renumber_map(
 cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_renumber_map_offsets(
   const cugraph_sample_result_t* result);
 
+/**
+ * @ingroup samplingC
+ * @brief     Get the edge renumber map
+ *
+ * @param [in]   result   The result from a sampling algorithm
+ * @return type erased array pointing to the renumber map
+ */
+cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_edge_renumber_map(
+  const cugraph_sample_result_t* result);
+
+/**
+ * @ingroup samplingC
+ * @brief     Get the edge renumber map offets
+ *
+ * @param [in]   result   The result from a sampling algorithm
+ * @return type erased array pointing to the renumber map
+ */
+cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_edge_renumber_map_offsets(
+  const cugraph_sample_result_t* result);
+
 /**
  * @ingroup samplingC
  * @brief     Free a sampling result
diff --git a/cpp/src/c_api/array.hpp b/cpp/src/c_api/array.hpp
index 048d2ee1cea..0ab30a1cb72 100644
--- a/cpp/src/c_api/array.hpp
+++ b/cpp/src/c_api/array.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -125,6 +125,27 @@ struct cugraph_type_erased_host_array_t {
     std::copy(vec.begin(), vec.end(), reinterpret_cast<T*>(data_.get()));
   }
 
+  cugraph_type_erased_host_array_t(cugraph_type_erased_host_array_view_t const* view_p)
+    : data_(std::make_unique<std::byte[]>(view_p->num_bytes_)),
+      size_(view_p->size_),
+      num_bytes_(view_p->num_bytes_),
+      type_(view_p->type_)
+  {
+    std::copy(view_p->data_, view_p->data_ + num_bytes_, data_.get());
+  }
+
+  template <typename T>
+  T* as_type()
+  {
+    return reinterpret_cast<T*>(data_.get());
+  }
+
+  template <typename T>
+  T const* as_type() const
+  {
+    return reinterpret_cast<T const*>(data_.get());
+  }
+
   auto view()
   {
     return new cugraph_type_erased_host_array_view_t{data_.get(), size_, num_bytes_, type_};
diff --git a/cpp/src/c_api/decompress_to_edgelist.cpp b/cpp/src/c_api/decompress_to_edgelist.cpp
new file mode 100644
index 00000000000..75bf0c0fd60
--- /dev/null
+++ b/cpp/src/c_api/decompress_to_edgelist.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "c_api/abstract_functor.hpp"
+#include "c_api/core_result.hpp"
+#include "c_api/edgelist.hpp"
+#include "c_api/graph.hpp"
+#include "c_api/resource_handle.hpp"
+#include "c_api/utils.hpp"
+
+#include <cugraph_c/algorithms.h>
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/detail/shuffle_wrappers.hpp>
+#include <cugraph/detail/utility_wrappers.hpp>
+#include <cugraph/graph_functions.hpp>
+
+#include <optional>
+
+namespace {
+
+struct decompress_to_edgelist_functor : public cugraph::c_api::abstract_functor {
+  raft::handle_t const& handle_;
+  cugraph::c_api::cugraph_graph_t* graph_{};
+
+  cugraph::c_api::cugraph_core_result_t const* core_result_{};
+  bool do_expensive_check_{};
+  cugraph::c_api::cugraph_edgelist_t* result_{};
+
+  decompress_to_edgelist_functor(cugraph_resource_handle_t const* handle,
+                                 cugraph_graph_t* graph,
+                                 bool do_expensive_check)
+    : abstract_functor(),
+      handle_(*reinterpret_cast<cugraph::c_api::cugraph_resource_handle_t const*>(handle)->handle_),
+      graph_(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)),
+      do_expensive_check_(do_expensive_check)
+  {
+  }
+
+  template <typename vertex_t,
+            typename edge_t,
+            typename weight_t,
+            typename edge_type_type_t,
+            bool store_transposed,
+            bool multi_gpu>
+  void operator()()
+  {
+    if constexpr (!cugraph::is_candidate<vertex_t, edge_t, weight_t>::value) {
+      unsupported();
+    } else {
+      if constexpr (store_transposed) {
+        error_code_ = cugraph::c_api::
+          transpose_storage<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>(
+            handle_, graph_, error_.get());
+        if (error_code_ != CUGRAPH_SUCCESS) return;
+      }
+
+      auto graph =
+        reinterpret_cast<cugraph::graph_t<vertex_t, edge_t, store_transposed, multi_gpu>*>(
+          graph_->graph_);
+
+      auto graph_view = graph->view();
+
+      auto edge_weights = reinterpret_cast<cugraph::edge_property_t<
+        cugraph::graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>,
+        weight_t>*>(graph_->edge_weights_);
+
+      auto edge_ids = reinterpret_cast<cugraph::edge_property_t<
+        cugraph::graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>,
+        edge_t>*>(graph_->edge_ids_);
+
+      auto edge_types = reinterpret_cast<cugraph::edge_property_t<
+        cugraph::graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>,
+        edge_type_type_t>*>(graph_->edge_types_);
+
+      auto number_map = reinterpret_cast<rmm::device_uvector<vertex_t>*>(graph_->number_map_);
+
+      auto [result_src, result_dst, result_wgt, result_edge_id, result_edge_type] =
+        cugraph::decompress_to_edgelist<vertex_t,
+                                        edge_t,
+                                        weight_t,
+                                        edge_type_type_t,
+                                        store_transposed,
+                                        multi_gpu>(
+          handle_,
+          graph_view,
+          (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
+          (edge_ids != nullptr) ? std::make_optional(edge_ids->view()) : std::nullopt,
+          (edge_types != nullptr) ? std::make_optional(edge_types->view()) : std::nullopt,
+          (number_map != nullptr) ? std::make_optional<raft::device_span<vertex_t const>>(
+                                      number_map->data(), number_map->size())
+                                  : std::nullopt,
+          do_expensive_check_);
+
+      result_ = new cugraph::c_api::cugraph_edgelist_t{
+        new cugraph::c_api::cugraph_type_erased_device_array_t(result_src, graph_->vertex_type_),
+        new cugraph::c_api::cugraph_type_erased_device_array_t(result_dst, graph_->vertex_type_),
+        result_wgt ? new cugraph::c_api::cugraph_type_erased_device_array_t(*result_wgt,
+                                                                            graph_->weight_type_)
+                   : NULL,
+        result_edge_id ? new cugraph::c_api::cugraph_type_erased_device_array_t(*result_edge_id,
+                                                                                graph_->edge_type_)
+                       : NULL,
+        result_edge_type ? new cugraph::c_api::cugraph_type_erased_device_array_t(
+                             *result_edge_type, graph_->edge_type_id_type_)
+                         : NULL,
+        NULL};
+    }
+  }
+};
+
+}  // namespace
+
+extern "C" cugraph_error_code_t cugraph_decompress_to_edgelist(
+  const cugraph_resource_handle_t* handle,
+  cugraph_graph_t* graph,
+  bool_t do_expensive_check,
+  cugraph_edgelist_t** result,
+  cugraph_error_t** error)
+{
+  decompress_to_edgelist_functor functor(handle, graph, do_expensive_check);
+
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
diff --git a/cpp/src/c_api/edgelist.cpp b/cpp/src/c_api/edgelist.cpp
new file mode 100644
index 00000000000..640b2bf2853
--- /dev/null
+++ b/cpp/src/c_api/edgelist.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "c_api/edgelist.hpp"
+
+#include <cugraph_c/algorithms.h>
+
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_sources(
+  cugraph_edgelist_t* edgelist)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_edgelist_t*>(edgelist);
+  return reinterpret_cast<cugraph_type_erased_device_array_view_t*>(internal_pointer->src_->view());
+}
+
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_destinations(
+  cugraph_edgelist_t* edgelist)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_edgelist_t*>(edgelist);
+  return reinterpret_cast<cugraph_type_erased_device_array_view_t*>(internal_pointer->dst_->view());
+}
+
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_weights(
+  cugraph_edgelist_t* edgelist)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_edgelist_t*>(edgelist);
+  return (internal_pointer->wgt_ == nullptr)
+           ? NULL
+           : reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+               internal_pointer->wgt_->view());
+}
+
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_ids(
+  cugraph_edgelist_t* edgelist)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_edgelist_t*>(edgelist);
+  return (internal_pointer->edge_ids_ == nullptr)
+           ? NULL
+           : reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+               internal_pointer->edge_ids_->view());
+}
+
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_type_ids(
+  cugraph_edgelist_t* edgelist)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_edgelist_t*>(edgelist);
+  return (internal_pointer->edge_type_ids_ == nullptr)
+           ? NULL
+           : reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+               internal_pointer->edge_type_ids_->view());
+}
+
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_offsets(
+  cugraph_edgelist_t* edgelist)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_edgelist_t*>(edgelist);
+  return reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+    internal_pointer->subgraph_offsets_->view());
+}
+
+extern "C" void cugraph_edgelist_free(cugraph_edgelist_t* edgelist)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_edgelist_t*>(edgelist);
+  delete internal_pointer->src_;
+  delete internal_pointer->dst_;
+  delete internal_pointer->wgt_;
+  delete internal_pointer->edge_ids_;
+  delete internal_pointer->edge_type_ids_;
+  delete internal_pointer->subgraph_offsets_;
+  delete internal_pointer;
+}
diff --git a/cpp/src/c_api/edgelist.hpp b/cpp/src/c_api/edgelist.hpp
new file mode 100644
index 00000000000..bc0f2d337f1
--- /dev/null
+++ b/cpp/src/c_api/edgelist.hpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "c_api/array.hpp"
+
+namespace cugraph {
+namespace c_api {
+
+struct cugraph_edgelist_t {
+  cugraph_type_erased_device_array_t* src_{};
+  cugraph_type_erased_device_array_t* dst_{};
+  cugraph_type_erased_device_array_t* wgt_{};
+  cugraph_type_erased_device_array_t* edge_ids_{};
+  cugraph_type_erased_device_array_t* edge_type_ids_{};
+  cugraph_type_erased_device_array_t* subgraph_offsets_{};
+};
+
+}  // namespace c_api
+}  // namespace cugraph
diff --git a/cpp/src/c_api/graph_functions.cpp b/cpp/src/c_api/graph_functions.cpp
index df741a349d2..8778369dbe6 100644
--- a/cpp/src/c_api/graph_functions.cpp
+++ b/cpp/src/c_api/graph_functions.cpp
@@ -84,7 +84,7 @@ struct create_vertex_pairs_functor : public cugraph::c_api::abstract_functor {
                               std::nullopt,
                               std::nullopt);
       }
-
+      // FIXME: use std::tuple (template) instead.
       result_ = new cugraph::c_api::cugraph_vertex_pairs_t{
         new cugraph::c_api::cugraph_type_erased_device_array_t(first_copy, graph_->vertex_type_),
         new cugraph::c_api::cugraph_type_erased_device_array_t(second_copy, graph_->vertex_type_)};
diff --git a/cpp/src/c_api/neighbor_sampling.cpp b/cpp/src/c_api/neighbor_sampling.cpp
index 69306806030..be3a44d813a 100644
--- a/cpp/src/c_api/neighbor_sampling.cpp
+++ b/cpp/src/c_api/neighbor_sampling.cpp
@@ -16,12 +16,15 @@
 
 #include "c_api/abstract_functor.hpp"
 #include "c_api/graph.hpp"
+#include "c_api/graph_helper.hpp"
 #include "c_api/properties.hpp"
 #include "c_api/random.hpp"
 #include "c_api/resource_handle.hpp"
 #include "c_api/utils.hpp"
+#include "sampling/detail/sampling_utils.hpp"
 
 #include <cugraph_c/algorithms.h>
+#include <cugraph_c/sampling_algorithms.h>
 
 #include <cugraph/algorithms.hpp>
 #include <cugraph/detail/shuffle_wrappers.hpp>
@@ -44,6 +47,13 @@ struct cugraph_sampling_options_t {
   bool_t retain_seeds_{FALSE};
 };
 
+struct sampling_flags_t {
+  prior_sources_behavior_t prior_sources_behavior_{prior_sources_behavior_t::DEFAULT};
+  bool_t return_hops_{FALSE};
+  bool_t dedupe_sources_{FALSE};
+  bool_t with_replacement_{FALSE};
+};
+
 struct cugraph_sample_result_t {
   cugraph_type_erased_device_array_t* major_offsets_{nullptr};
   cugraph_type_erased_device_array_t* majors_{nullptr};
@@ -56,6 +66,8 @@ struct cugraph_sample_result_t {
   cugraph_type_erased_device_array_t* label_{nullptr};
   cugraph_type_erased_device_array_t* renumber_map_{nullptr};
   cugraph_type_erased_device_array_t* renumber_map_offsets_{nullptr};
+  cugraph_type_erased_device_array_t* edge_renumber_map_{nullptr};
+  cugraph_type_erased_device_array_t* edge_renumber_map_offsets_{nullptr};
 };
 
 }  // namespace c_api
@@ -63,6 +75,7 @@ struct cugraph_sample_result_t {
 
 namespace {
 
+// Deprecated functor
 struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_functor {
   raft::handle_t const& handle_;
   cugraph::c_api::cugraph_graph_t* graph_{nullptr};
@@ -398,11 +411,14 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
                        : nullptr,
         (renumber_map_offsets) ? new cugraph::c_api::cugraph_type_erased_device_array_t(
                                    renumber_map_offsets.value(), SIZE_T)
-                               : nullptr};
+                               : nullptr,
+        nullptr,
+        nullptr};
     }
   }
 };
 
+// Deprecated functor
 struct biased_neighbor_sampling_functor : public cugraph::c_api::abstract_functor {
   raft::handle_t const& handle_;
   cugraph::c_api::cugraph_graph_t* graph_{nullptr};
@@ -748,7 +764,598 @@ struct biased_neighbor_sampling_functor : public cugraph::c_api::abstract_functo
                        : nullptr,
         (renumber_map_offsets) ? new cugraph::c_api::cugraph_type_erased_device_array_t(
                                    renumber_map_offsets.value(), SIZE_T)
-                               : nullptr};
+                               : nullptr,
+        nullptr,
+        nullptr};
+    }
+  }
+};
+
+struct neighbor_sampling_functor : public cugraph::c_api::abstract_functor {
+  raft::handle_t const& handle_;
+  cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr};
+  cugraph::c_api::cugraph_graph_t* graph_{nullptr};
+  cugraph::c_api::cugraph_edge_property_view_t const* edge_biases_{nullptr};
+  cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertices_{nullptr};
+  cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertex_offsets_{nullptr};
+  cugraph::c_api::cugraph_type_erased_host_array_view_t const* fan_out_{nullptr};
+  int num_edge_types_{};
+  cugraph::c_api::cugraph_sampling_options_t options_{};
+  bool is_biased_{false};
+  bool do_expensive_check_{false};
+  cugraph::c_api::cugraph_sample_result_t* result_{nullptr};
+
+  neighbor_sampling_functor(cugraph_resource_handle_t const* handle,
+                            cugraph_rng_state_t* rng_state,
+                            cugraph_graph_t* graph,
+                            cugraph_edge_property_view_t const* edge_biases,
+                            cugraph_type_erased_device_array_view_t const* start_vertices,
+                            cugraph_type_erased_device_array_view_t const* start_vertex_offsets,
+                            cugraph_type_erased_host_array_view_t const* fan_out,
+                            int num_edge_types,
+                            cugraph::c_api::cugraph_sampling_options_t options,
+                            bool is_biased,
+                            bool do_expensive_check)
+    : abstract_functor(),
+      handle_(*reinterpret_cast<cugraph::c_api::cugraph_resource_handle_t const*>(handle)->handle_),
+      rng_state_(reinterpret_cast<cugraph::c_api::cugraph_rng_state_t*>(rng_state)),
+      graph_(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)),
+      edge_biases_(
+        reinterpret_cast<cugraph::c_api::cugraph_edge_property_view_t const*>(edge_biases)),
+      start_vertices_(
+        reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+          start_vertices)),
+      start_vertex_offsets_(
+        reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+          start_vertex_offsets)),
+      fan_out_(
+        reinterpret_cast<cugraph::c_api::cugraph_type_erased_host_array_view_t const*>(fan_out)),
+      num_edge_types_(num_edge_types),
+      options_(options),
+      is_biased_(is_biased),
+      do_expensive_check_(do_expensive_check)
+  {
+  }
+
+  template <typename vertex_t,
+            typename edge_t,
+            typename weight_t,
+            typename edge_type_t,
+            bool store_transposed,
+            bool multi_gpu>
+  void operator()()
+  {
+    using label_t = int32_t;
+
+    // FIXME: Think about how to handle SG vice MG
+    if constexpr (!cugraph::is_candidate<vertex_t, edge_t, weight_t>::value) {
+      unsupported();
+    } else {
+      // uniform_nbr_sample expects store_transposed == false
+      if constexpr (store_transposed) {
+        error_code_ = cugraph::c_api::
+          transpose_storage<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>(
+            handle_, graph_, error_.get());
+        if (error_code_ != CUGRAPH_SUCCESS) return;
+      }
+
+      auto graph =
+        reinterpret_cast<cugraph::graph_t<vertex_t, edge_t, false, multi_gpu>*>(graph_->graph_);
+
+      auto graph_view = graph->view();
+
+      auto edge_weights = reinterpret_cast<
+        cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, true, multi_gpu>,
+                                 weight_t>*>(graph_->edge_weights_);
+
+      auto edge_ids = reinterpret_cast<
+        cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, true, multi_gpu>,
+                                 edge_t>*>(graph_->edge_ids_);
+
+      auto edge_types = reinterpret_cast<
+        cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, true, multi_gpu>,
+                                 edge_type_t>*>(graph_->edge_types_);
+
+      auto number_map = reinterpret_cast<rmm::device_uvector<vertex_t>*>(graph_->number_map_);
+
+      auto edge_biases =
+        edge_biases_ ? reinterpret_cast<cugraph::edge_property_view_t<edge_t, weight_t const*>*>(
+                         edge_biases_->edge_property_)
+                     : nullptr;
+
+      rmm::device_uvector<vertex_t> start_vertices(start_vertices_->size_, handle_.get_stream());
+      raft::copy(start_vertices.data(),
+                 start_vertices_->as_type<vertex_t>(),
+                 start_vertices.size(),
+                 handle_.get_stream());
+
+      std::optional<rmm::device_uvector<label_t>> start_vertex_labels{std::nullopt};
+      std::optional<rmm::device_uvector<label_t>> local_label_to_comm_rank{std::nullopt};
+      std::optional<rmm::device_uvector<label_t>> label_to_comm_rank{
+        std::nullopt};  // global after allgatherv
+
+      std::optional<rmm::device_uvector<edge_t>> renumbered_and_sorted_edge_id_renumber_map(
+        std::nullopt);
+      std::optional<rmm::device_uvector<size_t>>
+        renumbered_and_sorted_edge_id_renumber_map_label_type_offsets(std::nullopt);
+
+      if (start_vertex_offsets_ != nullptr) {
+        // Retrieve the start_vertex_labels
+        start_vertex_labels = cugraph::detail::convert_starting_vertex_label_offsets_to_labels(
+          handle_,
+          raft::device_span<size_t const>{start_vertex_offsets_->as_type<size_t>(),
+                                          start_vertex_offsets_->size_});
+
+        // Get the number of labels on each GPU
+
+        if constexpr (multi_gpu) {
+          auto num_local_labels = start_vertex_offsets_->size_ - 1;
+
+          auto global_labels = cugraph::host_scalar_allgather(
+            handle_.get_comms(), num_local_labels, handle_.get_stream());
+
+          std::exclusive_scan(
+            global_labels.begin(), global_labels.end(), global_labels.begin(), label_t{0});
+
+          // Compute the global start_vertex_label_offsets
+
+          cugraph::detail::transform_increment_ints(
+            raft::device_span<label_t>{(*start_vertex_labels).data(),
+                                       (*start_vertex_labels).size()},
+            (label_t)global_labels[handle_.get_comms().get_rank()],
+            handle_.get_stream());
+
+          rmm::device_uvector<label_t> unique_labels((*start_vertex_labels).size(),
+                                                     handle_.get_stream());
+          raft::copy(unique_labels.data(),
+                     (*start_vertex_labels).data(),
+                     unique_labels.size(),
+                     handle_.get_stream());
+
+          // Get unique labels
+          // sort the start_vertex_labels
+          cugraph::detail::sort_ints(
+            handle_.get_stream(),
+            raft::device_span<label_t>{unique_labels.data(), unique_labels.size()});
+
+          auto num_unique_labels = cugraph::detail::unique_ints(
+            handle_.get_stream(),
+            raft::device_span<label_t>{unique_labels.data(), unique_labels.size()});
+
+          (*local_label_to_comm_rank).resize(num_unique_labels, handle_.get_stream());
+
+          cugraph::detail::scalar_fill(
+            handle_.get_stream(),
+            (*local_label_to_comm_rank).begin(),  // This should be rename to rank
+            (*local_label_to_comm_rank).size(),
+            label_t{handle_.get_comms().get_rank()});
+
+          // Perform allgather to get global_label_to_comm_rank_d_vector
+          auto recvcounts = cugraph::host_scalar_allgather(
+            handle_.get_comms(), num_unique_labels, handle_.get_stream());
+
+          std::vector<size_t> displacements(recvcounts.size());
+          std::exclusive_scan(
+            recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0});
+
+          (*label_to_comm_rank)
+            .resize(displacements.back() + recvcounts.back(), handle_.get_stream());
+
+          cugraph::device_allgatherv(handle_.get_comms(),
+                                     (*local_label_to_comm_rank).begin(),
+                                     (*label_to_comm_rank).begin(),
+                                     recvcounts,
+                                     displacements,
+                                     handle_.get_stream());
+
+          std::tie(start_vertices, *start_vertex_labels) =
+            cugraph::detail::shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
+              handle_, std::move(start_vertices), std::move(*start_vertex_labels));
+        }
+      } else {
+        if constexpr (multi_gpu) {
+          start_vertices =
+            cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(
+              handle_, std::move(start_vertices));
+        }
+      }
+      //
+      // Need to renumber start_vertices
+      //
+      cugraph::renumber_local_ext_vertices<vertex_t, multi_gpu>(
+        handle_,
+        start_vertices.data(),
+        start_vertices.size(),
+        number_map->data(),
+        graph_view.local_vertex_partition_range_first(),
+        graph_view.local_vertex_partition_range_last(),
+        do_expensive_check_);
+
+      rmm::device_uvector<vertex_t> src(0, handle_.get_stream());
+      rmm::device_uvector<vertex_t> dst(0, handle_.get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt{std::nullopt};
+      std::optional<rmm::device_uvector<edge_t>> edge_id{std::nullopt};
+      std::optional<rmm::device_uvector<edge_type_t>> edge_type{std::nullopt};
+      std::optional<rmm::device_uvector<int32_t>> hop{std::nullopt};
+      std::optional<rmm::device_uvector<label_t>> edge_label{std::nullopt};
+      std::optional<rmm::device_uvector<size_t>> offsets{std::nullopt};
+
+      // FIXME: For biased sampling, the user should pass either biases or edge weights,
+      // otherwised throw an error and suggest the user to call uniform neighbor sample instead
+
+      if (num_edge_types_ > 1) {
+        // call heterogeneous neighbor sample
+        if (is_biased_) {
+          std::tie(src, dst, wgt, edge_id, edge_type, hop, offsets) =
+            cugraph::heterogeneous_biased_neighbor_sample(
+              handle_,
+              rng_state_->rng_state_,
+              graph_view,
+              (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
+              (edge_ids != nullptr) ? std::make_optional(edge_ids->view()) : std::nullopt,
+              (edge_types != nullptr) ? std::make_optional(edge_types->view()) : std::nullopt,
+              (edge_biases != nullptr) ? *edge_biases : edge_weights->view(),
+              raft::device_span<vertex_t const>{start_vertices.data(), start_vertices.size()},
+              (start_vertex_offsets_ != nullptr)
+                ? std::make_optional<raft::device_span<int const>>((*start_vertex_labels).data(),
+                                                                   (*start_vertex_labels).size())
+                : std::nullopt,
+              label_to_comm_rank ? std::make_optional(raft::device_span<int const>{
+                                     (*label_to_comm_rank).data(), (*label_to_comm_rank).size()})
+                                 : std::nullopt,
+              raft::host_span<const int>(fan_out_->as_type<const int>(), fan_out_->size_),
+              num_edge_types_,
+              cugraph::sampling_flags_t{options_.prior_sources_behavior_,
+                                        options_.return_hops_,
+                                        options_.dedupe_sources_,
+                                        options_.with_replacement_},
+              do_expensive_check_);
+        } else {
+          std::tie(src, dst, wgt, edge_id, edge_type, hop, offsets) =
+            cugraph::heterogeneous_uniform_neighbor_sample(
+              handle_,
+              rng_state_->rng_state_,
+              graph_view,
+              (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
+              (edge_ids != nullptr) ? std::make_optional(edge_ids->view()) : std::nullopt,
+              (edge_types != nullptr) ? std::make_optional(edge_types->view()) : std::nullopt,
+              raft::device_span<vertex_t const>{start_vertices.data(), start_vertices.size()},
+              (start_vertex_offsets_ != nullptr)
+                ? std::make_optional<raft::device_span<int const>>((*start_vertex_labels).data(),
+                                                                   (*start_vertex_labels).size())
+                : std::nullopt,
+              label_to_comm_rank ? std::make_optional(raft::device_span<int const>{
+                                     (*label_to_comm_rank).data(), (*label_to_comm_rank).size()})
+                                 : std::nullopt,
+              raft::host_span<const int>(fan_out_->as_type<const int>(), fan_out_->size_),
+              num_edge_types_,
+              cugraph::sampling_flags_t{options_.prior_sources_behavior_,
+                                        options_.return_hops_,
+                                        options_.dedupe_sources_,
+                                        options_.with_replacement_},
+              do_expensive_check_);
+        }
+      } else {
+        // Call homogeneous neighbor sample
+        if (is_biased_) {
+          std::tie(src, dst, wgt, edge_id, edge_type, hop, offsets) =
+            cugraph::homogeneous_biased_neighbor_sample(
+              handle_,
+              rng_state_->rng_state_,
+              graph_view,
+              (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
+              (edge_ids != nullptr) ? std::make_optional(edge_ids->view()) : std::nullopt,
+              (edge_types != nullptr) ? std::make_optional(edge_types->view()) : std::nullopt,
+              (edge_biases != nullptr) ? *edge_biases : edge_weights->view(),
+              raft::device_span<vertex_t const>{start_vertices.data(), start_vertices.size()},
+              (start_vertex_offsets_ != nullptr)
+                ? std::make_optional<raft::device_span<int const>>((*start_vertex_labels).data(),
+                                                                   (*start_vertex_labels).size())
+                : std::nullopt,
+              label_to_comm_rank ? std::make_optional(raft::device_span<int const>{
+                                     (*label_to_comm_rank).data(), (*label_to_comm_rank).size()})
+                                 : std::nullopt,
+              raft::host_span<const int>(fan_out_->as_type<const int>(), fan_out_->size_),
+              cugraph::sampling_flags_t{options_.prior_sources_behavior_,
+                                        options_.return_hops_,
+                                        options_.dedupe_sources_,
+                                        options_.with_replacement_},
+              do_expensive_check_);
+        } else {
+          std::tie(src, dst, wgt, edge_id, edge_type, hop, offsets) =
+            cugraph::homogeneous_uniform_neighbor_sample(
+              handle_,
+              rng_state_->rng_state_,
+              graph_view,
+              (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
+              (edge_ids != nullptr) ? std::make_optional(edge_ids->view()) : std::nullopt,
+              (edge_types != nullptr) ? std::make_optional(edge_types->view()) : std::nullopt,
+              raft::device_span<vertex_t const>{start_vertices.data(), start_vertices.size()},
+              (start_vertex_offsets_ != nullptr)
+                ? std::make_optional<raft::device_span<int const>>((*start_vertex_labels).data(),
+                                                                   (*start_vertex_labels).size())
+                : std::nullopt,
+              label_to_comm_rank ? std::make_optional(raft::device_span<int const>{
+                                     (*label_to_comm_rank).data(), (*label_to_comm_rank).size()})
+                                 : std::nullopt,
+              raft::host_span<const int>(fan_out_->as_type<const int>(), fan_out_->size_),
+              cugraph::sampling_flags_t{options_.prior_sources_behavior_,
+                                        options_.return_hops_,
+                                        options_.dedupe_sources_,
+                                        options_.with_replacement_},
+              do_expensive_check_);
+        }
+      }
+
+      std::vector<vertex_t> vertex_partition_lasts = graph_view.vertex_partition_range_lasts();
+
+      cugraph::unrenumber_int_vertices<vertex_t, multi_gpu>(handle_,
+                                                            src.data(),
+                                                            src.size(),
+                                                            number_map->data(),
+                                                            vertex_partition_lasts,
+                                                            do_expensive_check_);
+
+      cugraph::unrenumber_int_vertices<vertex_t, multi_gpu>(handle_,
+                                                            dst.data(),
+                                                            dst.size(),
+                                                            number_map->data(),
+                                                            vertex_partition_lasts,
+                                                            do_expensive_check_);
+
+      std::optional<rmm::device_uvector<vertex_t>> majors{std::nullopt};
+      rmm::device_uvector<vertex_t> minors(0, handle_.get_stream());
+      std::optional<rmm::device_uvector<size_t>> major_offsets{std::nullopt};
+
+      std::optional<rmm::device_uvector<size_t>> label_hop_offsets{std::nullopt};
+
+      std::optional<rmm::device_uvector<vertex_t>> renumber_map{std::nullopt};
+      std::optional<rmm::device_uvector<size_t>> renumber_map_offsets{std::nullopt};
+
+      bool src_is_major = (options_.compression_type_ == cugraph_compression_type_t::CSR) ||
+                          (options_.compression_type_ == cugraph_compression_type_t::DCSR) ||
+                          (options_.compression_type_ == cugraph_compression_type_t::COO);
+
+      // Extract the edge_label from the offsets
+      if (offsets) {
+        edge_label = cugraph::c_api::expand_sparse_offsets(
+          raft::device_span<size_t const>{(*offsets).data(), (*offsets).size()},
+          label_t{0},
+          handle_.get_stream());
+      }
+
+      if (options_.renumber_results_) {
+        if (num_edge_types_ == 1) {  // homogeneous renumbering
+          if (options_.compression_type_ == cugraph_compression_type_t::COO) {
+            // COO
+
+            rmm::device_uvector<vertex_t> output_majors(0, handle_.get_stream());
+            rmm::device_uvector<vertex_t> output_renumber_map(0, handle_.get_stream());
+            std::tie(output_majors,
+                     minors,
+                     wgt,
+                     edge_id,
+                     edge_type,
+                     label_hop_offsets,
+                     output_renumber_map,
+                     renumber_map_offsets) =
+              cugraph::renumber_and_sort_sampled_edgelist<vertex_t>(
+                handle_,
+                std::move(src),
+                std::move(dst),
+                std::move(wgt),
+                std::move(edge_id),
+                std::move(edge_type),
+                std::move(hop),
+                options_.retain_seeds_
+                  ? std::make_optional(raft::device_span<vertex_t const>{
+                      start_vertices_->as_type<vertex_t>(), start_vertices_->size_})
+                  : std::nullopt,
+                options_.retain_seeds_
+                  ? std::make_optional(raft::device_span<size_t const>{
+                      start_vertex_offsets_->as_type<size_t>(), start_vertex_offsets_->size_})
+                  : std::nullopt,
+                offsets ? std::make_optional(
+                            raft::device_span<size_t const>{offsets->data(), offsets->size()})
+                        : std::nullopt,
+                offsets ? (*offsets).size() - 1 : size_t{1},
+                hop ? fan_out_->size_ : size_t{1},
+                src_is_major,
+                do_expensive_check_);
+
+            majors.emplace(std::move(output_majors));
+            renumber_map.emplace(std::move(output_renumber_map));
+          } else {
+            // (D)CSC, (D)CSR
+
+            bool doubly_compress =
+              (options_.compression_type_ == cugraph_compression_type_t::DCSR) ||
+              (options_.compression_type_ == cugraph_compression_type_t::DCSC);
+
+            rmm::device_uvector<size_t> output_major_offsets(0, handle_.get_stream());
+            rmm::device_uvector<vertex_t> output_renumber_map(0, handle_.get_stream());
+
+            std::tie(majors,
+                     output_major_offsets,
+                     minors,
+                     wgt,
+                     edge_id,
+                     edge_type,
+                     label_hop_offsets,
+                     output_renumber_map,
+                     renumber_map_offsets) =
+              cugraph::renumber_and_compress_sampled_edgelist<vertex_t>(
+                handle_,
+                std::move(src),
+                std::move(dst),
+                std::move(wgt),
+                std::move(edge_id),
+                std::move(edge_type),
+                std::move(hop),
+                options_.retain_seeds_
+                  ? std::make_optional(raft::device_span<vertex_t const>{
+                      start_vertices_->as_type<vertex_t>(), start_vertices_->size_})
+                  : std::nullopt,
+                options_.retain_seeds_
+                  ? std::make_optional(raft::device_span<size_t const>{
+                      start_vertex_offsets_->as_type<size_t>(), start_vertex_offsets_->size_})
+                  : std::nullopt,
+                offsets ? std::make_optional(
+                            raft::device_span<size_t const>{offsets->data(), offsets->size()})
+                        : std::nullopt,
+                edge_label ? (*offsets).size() - 1 : size_t{1},  // FIXME: update edge_label
+                hop ? fan_out_->size_ : size_t{1},
+                src_is_major,
+                options_.compress_per_hop_,
+                doubly_compress,
+                do_expensive_check_);
+
+            renumber_map.emplace(std::move(output_renumber_map));
+            major_offsets.emplace(std::move(output_major_offsets));
+          }
+
+          // These are now represented by label_hop_offsets
+          hop.reset();
+          offsets.reset();
+
+        } else {  // heterogeneous renumbering
+
+          rmm::device_uvector<vertex_t> vertex_type_offsets(
+            graph_view.local_vertex_partition_range_size(), handle_.get_stream());
+
+          cugraph::detail::sequence_fill(handle_.get_stream(),
+                                         vertex_type_offsets.begin(),
+                                         vertex_type_offsets.size(),
+                                         vertex_t{0}  // FIXME: Update array
+          );
+
+          rmm::device_uvector<vertex_t> output_majors(0, handle_.get_stream());
+          rmm::device_uvector<vertex_t> output_renumber_map(0, handle_.get_stream());
+
+          // extract the edge_type from label_type_hop_offsets
+          std::optional<rmm::device_uvector<size_t>> label_type_hop_offsets{std::nullopt};
+          std::tie(output_majors,
+                   minors,
+                   wgt,
+                   edge_id,
+                   label_type_hop_offsets,  // Contains information about the type and hop offsets
+                   output_renumber_map,
+                   (*renumber_map_offsets),
+                   renumbered_and_sorted_edge_id_renumber_map,
+                   renumbered_and_sorted_edge_id_renumber_map_label_type_offsets) =
+            cugraph::heterogeneous_renumber_and_sort_sampled_edgelist<vertex_t>(
+              handle_,
+              std::move(src),
+              std::move(dst),
+              std::move(wgt),
+              std::move(edge_id),
+              std::move(edge_type),
+              std::move(hop),
+              options_.retain_seeds_
+                ? std::make_optional(raft::device_span<vertex_t const>{
+                    start_vertices_->as_type<vertex_t>(), start_vertices_->size_})
+                : std::nullopt,
+              options_.retain_seeds_
+                ? std::make_optional(raft::device_span<size_t const>{
+                    start_vertex_offsets_->as_type<size_t>(), start_vertex_offsets_->size_})
+                : std::nullopt,
+              offsets ? std::make_optional(
+                          raft::device_span<size_t const>{offsets->data(), offsets->size()})
+                      : std::nullopt,
+              raft::device_span<vertex_t const>{vertex_type_offsets.data(),
+                                                vertex_type_offsets.size()},
+
+              edge_label ? (*offsets).size() - 1 : size_t{1},
+              hop ? fan_out_->size_ : size_t{1},
+              size_t{1},
+              num_edge_types_,
+              src_is_major,
+              do_expensive_check_);
+          if (edge_type) {
+            (*edge_type)
+              .resize(raft::device_span<size_t const>{(*label_type_hop_offsets).data(),
+                                                      (*label_type_hop_offsets).size()}
+                          .back() -
+                        1,
+                      handle_.get_stream());
+            cugraph::detail::sequence_fill(
+              handle_.get_stream(), (*edge_type).begin(), (*edge_type).size(), edge_type_t{0});
+          }
+
+          majors.emplace(std::move(output_majors));
+          // FIXME: Need to update renumber_map because default values are being passed
+          renumber_map.emplace(std::move(output_renumber_map));
+        }
+
+      } else {
+        if (options_.compression_type_ != cugraph_compression_type_t::COO) {
+          CUGRAPH_FAIL("Can only use COO format if not renumbering");
+        }
+
+        std::tie(src, dst, wgt, edge_id, edge_type, label_hop_offsets) =
+          cugraph::sort_sampled_edgelist(handle_,
+                                         std::move(src),
+                                         std::move(dst),
+                                         std::move(wgt),
+                                         std::move(edge_id),
+                                         std::move(edge_type),
+                                         std::move(hop),
+                                         offsets
+                                           ? std::make_optional(raft::device_span<size_t const>{
+                                               offsets->data(), offsets->size()})
+                                           : std::nullopt,
+                                         // derive label size from offset size instead of performing
+                                         // thrust::unique on edge_label.
+                                         edge_label ? (*offsets).size() - 1 : size_t{1},
+                                         hop ? fan_out_->size_ : size_t{1},
+                                         src_is_major,
+                                         do_expensive_check_);
+
+        majors.emplace(std::move(src));
+        minors = std::move(dst);
+
+        hop.reset();
+        offsets.reset();
+      }
+
+      result_ = new cugraph::c_api::cugraph_sample_result_t{
+        (major_offsets)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(*major_offsets, SIZE_T)
+          : nullptr,
+        (majors)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(*majors, graph_->vertex_type_)
+          : nullptr,
+        new cugraph::c_api::cugraph_type_erased_device_array_t(minors, graph_->vertex_type_),
+        (edge_id)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(*edge_id, graph_->edge_type_)
+          : nullptr,
+        (edge_type) ? new cugraph::c_api::cugraph_type_erased_device_array_t(
+                        *edge_type, graph_->edge_type_id_type_)
+                    : nullptr,
+        (wgt) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*wgt, graph_->weight_type_)
+              : nullptr,
+        (hop) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*hop, INT32)
+              : nullptr,  // FIXME get rid of this
+        (label_hop_offsets)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(*label_hop_offsets, SIZE_T)
+          : nullptr,
+        (edge_label)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(edge_label.value(), INT32)
+          : nullptr,
+        (renumber_map) ? new cugraph::c_api::cugraph_type_erased_device_array_t(
+                           renumber_map.value(), graph_->vertex_type_)
+                       : nullptr,
+        (renumber_map_offsets) ? new cugraph::c_api::cugraph_type_erased_device_array_t(
+                                   renumber_map_offsets.value(), SIZE_T)
+                               : nullptr,
+        (renumbered_and_sorted_edge_id_renumber_map)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(
+              renumbered_and_sorted_edge_id_renumber_map.value(), graph_->edge_type_)
+          : nullptr,
+        (renumbered_and_sorted_edge_id_renumber_map_label_type_offsets)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(
+              renumbered_and_sorted_edge_id_renumber_map_label_type_offsets.value(), SIZE_T)
+          : nullptr};
     }
   }
 };
@@ -985,6 +1592,26 @@ extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_re
                internal_pointer->renumber_map_offsets_->view());
 }
 
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_edge_renumber_map(
+  const cugraph_sample_result_t* result)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_sample_result_t const*>(result);
+  return internal_pointer->renumber_map_ == nullptr
+           ? NULL
+           : reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+               internal_pointer->edge_renumber_map_->view());
+}
+
+extern "C" cugraph_type_erased_device_array_view_t*
+cugraph_sample_result_get_edge_renumber_map_offsets(const cugraph_sample_result_t* result)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_sample_result_t const*>(result);
+  return internal_pointer->renumber_map_ == nullptr
+           ? NULL
+           : reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+               internal_pointer->edge_renumber_map_offsets_->view());
+}
+
 extern "C" cugraph_error_code_t cugraph_test_uniform_neighborhood_sample_result_create(
   const cugraph_resource_handle_t* handle,
   const cugraph_type_erased_device_array_view_t* srcs,
@@ -1292,6 +1919,7 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample(
     "fan_out should be of type int",
     *error);
 
+  //  Deprecated functor
   uniform_neighbor_sampling_functor functor{handle,
                                             graph,
                                             start_vertices,
@@ -1369,6 +1997,7 @@ cugraph_error_code_t cugraph_biased_neighbor_sample(
     "fan_out should be of type int",
     *error);
 
+  // Deprecated functor
   biased_neighbor_sampling_functor functor{handle,
                                            graph,
                                            edge_biases,
@@ -1383,3 +2012,249 @@ cugraph_error_code_t cugraph_biased_neighbor_sample(
                                            do_expensive_check};
   return cugraph::c_api::run_algorithm(graph, functor, result, error);
 }
+
+cugraph_error_code_t cugraph_heterogeneous_uniform_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* start_vertex_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  int num_edge_types,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error)
+{
+  auto options_cpp = *reinterpret_cast<cugraph::c_api::cugraph_sampling_options_t const*>(options);
+
+  // FIXME: Should we maintain this contition?
+  CAPI_EXPECTS((!options_cpp.retain_seeds_) || (start_vertex_offsets != nullptr),
+               CUGRAPH_INVALID_INPUT,
+               "must specify start_vertex_offsets if retain_seeds is true",
+               *error);
+
+  CAPI_EXPECTS((start_vertex_offsets == nullptr) ||
+                 (reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                    start_vertex_offsets)
+                    ->type_ == SIZE_T),
+               CUGRAPH_INVALID_INPUT,
+               "start_vertex_offsets should be of type size_t",
+               *error);
+
+  CAPI_EXPECTS(
+    reinterpret_cast<cugraph::c_api::cugraph_type_erased_host_array_view_t const*>(fan_out)
+        ->type_ == INT32,
+    CUGRAPH_INVALID_INPUT,
+    "fan_out should be of type int",
+    *error);
+
+  CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->vertex_type_ ==
+                 reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                   start_vertices)
+                   ->type_,
+               CUGRAPH_INVALID_INPUT,
+               "vertex type of graph and start_vertices must match",
+               *error);
+
+  neighbor_sampling_functor functor{handle,
+                                    rng_state,
+                                    graph,
+                                    nullptr,
+                                    start_vertices,
+                                    start_vertex_offsets,
+                                    fan_out,
+                                    num_edge_types,
+                                    std::move(options_cpp),
+                                    FALSE,
+                                    do_expensive_check};
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
+
+cugraph_error_code_t cugraph_heterogeneous_biased_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_edge_property_view_t* edge_biases,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* start_vertex_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  int num_edge_types,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error)
+{
+  auto options_cpp = *reinterpret_cast<cugraph::c_api::cugraph_sampling_options_t const*>(options);
+
+  CAPI_EXPECTS(
+    (edge_biases != nullptr) ||
+      (reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->edge_weights_ != nullptr),
+    CUGRAPH_INVALID_INPUT,
+    "edge_biases is required if the graph is not weighted",
+    *error);
+
+  // FIXME: Should we maintain this contition?
+  CAPI_EXPECTS((!options_cpp.retain_seeds_) || (start_vertex_offsets != nullptr),
+               CUGRAPH_INVALID_INPUT,
+               "must specify start_vertex_offsets if retain_seeds is true",
+               *error);
+
+  CAPI_EXPECTS((start_vertex_offsets == nullptr) ||
+                 (reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                    start_vertex_offsets)
+                    ->type_ == SIZE_T),
+               CUGRAPH_INVALID_INPUT,
+               "start_vertex_offsets should be of type size_t",
+               *error);
+
+  CAPI_EXPECTS(
+    reinterpret_cast<cugraph::c_api::cugraph_type_erased_host_array_view_t const*>(fan_out)
+        ->type_ == INT32,
+    CUGRAPH_INVALID_INPUT,
+    "fan_out should be of type int",
+    *error);
+
+  CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->vertex_type_ ==
+                 reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                   start_vertices)
+                   ->type_,
+               CUGRAPH_INVALID_INPUT,
+               "vertex type of graph and start_vertices must match",
+               *error);
+
+  neighbor_sampling_functor functor{handle,
+                                    rng_state,
+                                    graph,
+                                    edge_biases,
+                                    start_vertices,
+                                    start_vertex_offsets,
+                                    fan_out,
+                                    num_edge_types,
+                                    std::move(options_cpp),
+                                    TRUE,
+                                    do_expensive_check};
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
+
+cugraph_error_code_t cugraph_homogeneous_uniform_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* start_vertex_offsets,  // RENAME?
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error)
+{
+  auto options_cpp = *reinterpret_cast<cugraph::c_api::cugraph_sampling_options_t const*>(options);
+
+  // FIXME: Should we maintain this contition?
+  CAPI_EXPECTS((!options_cpp.retain_seeds_) || (start_vertex_offsets != nullptr),
+               CUGRAPH_INVALID_INPUT,
+               "must specify start_vertex_offsets if retain_seeds is true",
+               *error);
+
+  CAPI_EXPECTS((start_vertex_offsets == nullptr) ||
+                 (reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                    start_vertex_offsets)
+                    ->type_ == SIZE_T),
+               CUGRAPH_INVALID_INPUT,
+               "start_vertex_offsets should be of type size_t",
+               *error);
+
+  CAPI_EXPECTS(
+    reinterpret_cast<cugraph::c_api::cugraph_type_erased_host_array_view_t const*>(fan_out)
+        ->type_ == INT32,
+    CUGRAPH_INVALID_INPUT,
+    "fan_out type must be INT32",
+    *error);
+
+  CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->vertex_type_ ==
+                 reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                   start_vertices)
+                   ->type_,
+               CUGRAPH_INVALID_INPUT,
+               "vertex type of graph and start_vertices must match",
+               *error);
+
+  neighbor_sampling_functor functor{handle,
+                                    rng_state,
+                                    graph,
+                                    nullptr,
+                                    start_vertices,
+                                    start_vertex_offsets,
+                                    fan_out,
+                                    1,  // num_edge_types
+                                    std::move(options_cpp),
+                                    FALSE,
+                                    do_expensive_check};
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
+
+cugraph_error_code_t cugraph_homogeneous_biased_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_edge_property_view_t* edge_biases,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* start_vertex_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error)
+{
+  auto options_cpp = *reinterpret_cast<cugraph::c_api::cugraph_sampling_options_t const*>(options);
+
+  CAPI_EXPECTS(
+    (edge_biases != nullptr) ||
+      (reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->edge_weights_ != nullptr),
+    CUGRAPH_INVALID_INPUT,
+    "edge_biases is required if the graph is not weighted",
+    *error);
+
+  // FIXME: Should we maintain this contition?
+  CAPI_EXPECTS((!options_cpp.retain_seeds_) || (start_vertex_offsets != nullptr),
+               CUGRAPH_INVALID_INPUT,
+               "must specify start_vertex_offsets if retain_seeds is true",
+               *error);
+
+  CAPI_EXPECTS((start_vertex_offsets == nullptr) ||
+                 (reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                    start_vertex_offsets)
+                    ->type_ == SIZE_T),
+               CUGRAPH_INVALID_INPUT,
+               "start_vertex_offsets should be of type size_t",
+               *error);
+
+  CAPI_EXPECTS(
+    reinterpret_cast<cugraph::c_api::cugraph_type_erased_host_array_view_t const*>(fan_out)
+        ->type_ == INT32,
+    CUGRAPH_INVALID_INPUT,
+    "fan_out type must be INT32",
+    *error);
+
+  CAPI_EXPECTS(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)->vertex_type_ ==
+                 reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+                   start_vertices)
+                   ->type_,
+               CUGRAPH_INVALID_INPUT,
+               "vertex type of graph and start_vertices must match",
+               *error);
+
+  neighbor_sampling_functor functor{handle,
+                                    rng_state,
+                                    graph,
+                                    edge_biases,
+                                    start_vertices,
+                                    start_vertex_offsets,
+                                    fan_out,
+                                    1,  // num_edge_types
+                                    std::move(options_cpp),
+                                    TRUE,
+                                    do_expensive_check};
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
diff --git a/cpp/src/detail/utility_wrappers_32.cu b/cpp/src/detail/utility_wrappers_32.cu
index de407f12493..879a1adf337 100644
--- a/cpp/src/detail/utility_wrappers_32.cu
+++ b/cpp/src/detail/utility_wrappers_32.cu
@@ -63,6 +63,10 @@ template void scalar_fill(raft::handle_t const& handle, size_t* d_value, size_t
 
 template void scalar_fill(raft::handle_t const& handle, float* d_value, size_t size, float value);
 
+template void sort_ints(raft::handle_t const& handle, raft::device_span<int32_t> values);
+
+template size_t unique_ints(raft::handle_t const& handle, raft::device_span<int32_t> values);
+
 template void sequence_fill(rmm::cuda_stream_view const& stream_view,
                             int32_t* d_value,
                             size_t size,
@@ -73,6 +77,10 @@ template void sequence_fill(rmm::cuda_stream_view const& stream_view,
                             size_t size,
                             uint32_t start_value);
 
+template void transform_increment_ints(raft::device_span<int32_t> values,
+                                       int32_t value,
+                                       rmm::cuda_stream_view const& stream_view);
+
 template void stride_fill(rmm::cuda_stream_view const& stream_view,
                           int32_t* d_value,
                           size_t size,
diff --git a/cpp/src/detail/utility_wrappers_64.cu b/cpp/src/detail/utility_wrappers_64.cu
index 2c136d5902b..742cb18d718 100644
--- a/cpp/src/detail/utility_wrappers_64.cu
+++ b/cpp/src/detail/utility_wrappers_64.cu
@@ -61,6 +61,10 @@ template void scalar_fill(raft::handle_t const& handle,
 
 template void scalar_fill(raft::handle_t const& handle, double* d_value, size_t size, double value);
 
+template void sort_ints(raft::handle_t const& handle, raft::device_span<int64_t> values);
+
+template size_t unique_ints(raft::handle_t const& handle, raft::device_span<int64_t> values);
+
 template void sequence_fill(rmm::cuda_stream_view const& stream_view,
                             int64_t* d_value,
                             size_t size,
@@ -71,6 +75,10 @@ template void sequence_fill(rmm::cuda_stream_view const& stream_view,
                             size_t size,
                             uint64_t start_value);
 
+template void transform_increment_ints(raft::device_span<int64_t> values,
+                                       int64_t value,
+                                       rmm::cuda_stream_view const& stream_view);
+
 template void stride_fill(rmm::cuda_stream_view const& stream_view,
                           int64_t* d_value,
                           size_t size,
diff --git a/cpp/src/detail/utility_wrappers_impl.cuh b/cpp/src/detail/utility_wrappers_impl.cuh
index 074d7044261..93bd14c4d06 100644
--- a/cpp/src/detail/utility_wrappers_impl.cuh
+++ b/cpp/src/detail/utility_wrappers_impl.cuh
@@ -36,6 +36,7 @@
 #include <thrust/transform.h>
 #include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
+#include <thrust/unique.h>
 
 namespace cugraph {
 namespace detail {
@@ -63,6 +64,20 @@ void scalar_fill(raft::handle_t const& handle, value_t* d_value, size_t size, va
   thrust::fill_n(handle.get_thrust_policy(), d_value, size, value);
 }
 
+template <typename value_t>
+void sort_ints(raft::handle_t const& handle, raft::device_span<value_t> values)
+{
+  thrust::sort(handle.get_thrust_policy(), values.begin(), values.end());
+}
+
+template <typename value_t>
+size_t unique_ints(raft::handle_t const& handle, raft::device_span<value_t> values)
+{
+  auto unique_element_last =
+    thrust::unique(handle.get_thrust_policy(), values.begin(), values.end());
+  return thrust::distance(values.begin(), unique_element_last);
+}
+
 template <typename value_t>
 void sequence_fill(rmm::cuda_stream_view const& stream_view,
                    value_t* d_value,
@@ -72,6 +87,20 @@ void sequence_fill(rmm::cuda_stream_view const& stream_view,
   thrust::sequence(rmm::exec_policy(stream_view), d_value, d_value + size, start_value);
 }
 
+template <typename value_t>
+void transform_increment_ints(raft::device_span<value_t> values,
+                              value_t incr,
+                              rmm::cuda_stream_view const& stream_view)
+{
+  thrust::transform(rmm::exec_policy(stream_view),
+                    values.begin(),
+                    values.end(),
+                    values.begin(),
+                    cuda::proclaim_return_type<value_t>([incr] __device__(value_t value) {
+                      return static_cast<value_t>(value + incr);
+                    }));
+}
+
 template <typename value_t>
 void stride_fill(rmm::cuda_stream_view const& stream_view,
                  value_t* d_value,
diff --git a/cpp/src/link_prediction/similarity_impl.cuh b/cpp/src/link_prediction/similarity_impl.cuh
index b39895129dc..00f73b5c263 100644
--- a/cpp/src/link_prediction/similarity_impl.cuh
+++ b/cpp/src/link_prediction/similarity_impl.cuh
@@ -287,10 +287,8 @@ all_pairs_similarity(raft::handle_t const& handle,
     //  computing/updating topk with each batch
 
     //   FIXME: Experiment with this and adjust as necessary
-    // size_t const
-    // MAX_PAIRS_PER_BATCH{static_cast<size_t>(handle.get_device_properties().multiProcessorCount) *
-    // (1 << 15)};
-    size_t const MAX_PAIRS_PER_BATCH{100};
+    size_t const MAX_PAIRS_PER_BATCH{
+      static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 15)};
 
     rmm::device_uvector<edge_t> degrees = graph_view.compute_out_degrees(handle);
     rmm::device_uvector<size_t> two_hop_degrees(degrees.size() + 1, handle.get_stream());
@@ -362,195 +360,205 @@ all_pairs_similarity(raft::handle_t const& handle,
                       1,
                       handle.get_stream());
 
+    handle.sync_stream();
+
     std::tie(batch_offsets, std::ignore) = compute_offset_aligned_element_chunks(
       handle,
       raft::device_span<size_t const>{two_hop_degree_offsets.data(), two_hop_degree_offsets.size()},
       sum_two_hop_degrees,
       MAX_PAIRS_PER_BATCH);
 
-    for (size_t batch_number = 0; batch_number < (batch_offsets.size() - 1); ++batch_number) {
-      if (batch_offsets[batch_number + 1] > batch_offsets[batch_number]) {
-        auto [offsets, v2] =
-          k_hop_nbrs(handle,
-                     graph_view,
-                     raft::device_span<vertex_t const>{
-                       tmp_vertices.data() + batch_offsets[batch_number],
-                       batch_offsets[batch_number + 1] - batch_offsets[batch_number]},
-                     2,
-                     do_expensive_check);
-
-        auto v1 = cugraph::detail::expand_sparse_offsets(
-          raft::device_span<size_t const>{offsets.data(), offsets.size()},
-          vertex_t{0},
-          handle.get_stream());
+    // FIXME: compute_offset_aligned_element_chunks can return duplicates.  Should it?  Should
+    // explore
+    //  whether this functionality should be pushed into that function
+    batch_offsets.resize(std::distance(batch_offsets.begin(),
+                                       std::unique(batch_offsets.begin(), batch_offsets.end())));
 
-        cugraph::unrenumber_local_int_vertices(
-          handle,
-          v1.data(),
-          v1.size(),
+    size_t num_batches = batch_offsets.size() - 1;
+    if constexpr (multi_gpu) {
+      num_batches = cugraph::host_scalar_allreduce(
+        handle.get_comms(), num_batches, raft::comms::op_t::MAX, handle.get_stream());
+    }
+
+    for (size_t batch_number = 0; batch_number < num_batches; ++batch_number) {
+      raft::device_span<vertex_t const> batch_seeds{tmp_vertices.data(), size_t{0}};
+
+      if (((batch_number + 1) < batch_offsets.size()) &&
+          (batch_offsets[batch_number + 1] > batch_offsets[batch_number])) {
+        batch_seeds = raft::device_span<vertex_t const>{
           tmp_vertices.data() + batch_offsets[batch_number],
-          vertex_t{0},
-          static_cast<vertex_t>(batch_offsets[batch_number + 1] - batch_offsets[batch_number]),
-          do_expensive_check);
+          batch_offsets[batch_number + 1] - batch_offsets[batch_number]};
+      }
+
+      auto [offsets, v2] = k_hop_nbrs(handle, graph_view, batch_seeds, 2, do_expensive_check);
 
-        auto new_size = thrust::distance(
+      auto v1 = cugraph::detail::expand_sparse_offsets(
+        raft::device_span<size_t const>{offsets.data(), offsets.size()},
+        vertex_t{0},
+        handle.get_stream());
+
+      cugraph::unrenumber_local_int_vertices(
+        handle,
+        v1.data(),
+        v1.size(),
+        tmp_vertices.data() + batch_offsets[batch_number],
+        vertex_t{0},
+        static_cast<vertex_t>(batch_offsets[batch_number + 1] - batch_offsets[batch_number]),
+        do_expensive_check);
+
+      auto new_size = thrust::distance(
+        thrust::make_zip_iterator(v1.begin(), v2.begin()),
+        thrust::remove_if(
+          handle.get_thrust_policy(),
           thrust::make_zip_iterator(v1.begin(), v2.begin()),
-          thrust::remove_if(
-            handle.get_thrust_policy(),
-            thrust::make_zip_iterator(v1.begin(), v2.begin()),
-            thrust::make_zip_iterator(v1.end(), v2.end()),
-            [] __device__(auto tuple) { return thrust::get<0>(tuple) == thrust::get<1>(tuple); }));
-
-        v1.resize(new_size, handle.get_stream());
-        v2.resize(new_size, handle.get_stream());
-
-        if constexpr (multi_gpu) {
-          // shuffle vertex pairs
-          auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts();
-
-          std::tie(v1, v2, std::ignore, std::ignore, std::ignore, std::ignore) =
-            detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
-                                                                                           edge_t,
-                                                                                           weight_t,
-                                                                                           int>(
-              handle,
-              std::move(v1),
-              std::move(v2),
-              std::nullopt,
-              std::nullopt,
-              std::nullopt,
-              vertex_partition_range_lasts);
-        }
+          thrust::make_zip_iterator(v1.end(), v2.end()),
+          [] __device__(auto tuple) { return thrust::get<0>(tuple) == thrust::get<1>(tuple); }));
 
-        auto score =
-          similarity(handle,
-                     graph_view,
-                     edge_weight_view,
-                     std::make_tuple(raft::device_span<vertex_t const>{v1.data(), v1.size()},
-                                     raft::device_span<vertex_t const>{v2.data(), v2.size()}),
-                     functor,
-                     coeff,
-                     do_expensive_check);
-
-        // Add a remove_if to remove items that are less than the last topk element
-        new_size = thrust::distance(
-          thrust::make_zip_iterator(score.begin(), v1.begin(), v2.begin()),
-          thrust::remove_if(handle.get_thrust_policy(),
-                            thrust::make_zip_iterator(score.begin(), v1.begin(), v2.begin()),
-                            thrust::make_zip_iterator(score.end(), v1.end(), v2.end()),
-                            [similarity_threshold] __device__(auto tuple) {
-                              return thrust::get<0>(tuple) < similarity_threshold;
-                            }));
-
-        score.resize(new_size, handle.get_stream());
-        v1.resize(new_size, handle.get_stream());
-        v2.resize(new_size, handle.get_stream());
-
-        thrust::sort_by_key(handle.get_thrust_policy(),
-                            score.begin(),
-                            score.end(),
-                            thrust::make_zip_iterator(v1.begin(), v2.begin()),
-                            thrust::greater<weight_t>{});
-
-        size_t v1_keep = std::min(*topk, v1.size());
-
-        if (score.size() < (top_v1.size() + v1_keep)) {
-          score.resize(top_v1.size() + v1_keep, handle.get_stream());
-          v1.resize(score.size(), handle.get_stream());
-          v2.resize(score.size(), handle.get_stream());
-        }
+      v1.resize(new_size, handle.get_stream());
+      v2.resize(new_size, handle.get_stream());
 
-        thrust::copy(
-          handle.get_thrust_policy(), top_v1.begin(), top_v1.end(), v1.begin() + v1_keep);
-        thrust::copy(
-          handle.get_thrust_policy(), top_v2.begin(), top_v2.end(), v2.begin() + v1_keep);
-        thrust::copy(
-          handle.get_thrust_policy(), top_score.begin(), top_score.end(), score.begin() + v1_keep);
-
-        thrust::sort_by_key(handle.get_thrust_policy(),
-                            score.begin(),
-                            score.end(),
-                            thrust::make_zip_iterator(v1.begin(), v2.begin()),
-                            thrust::greater<weight_t>{});
-
-        if (top_v1.size() < std::min(*topk, v1.size())) {
-          top_v1.resize(std::min(*topk, v1.size()), handle.get_stream());
-          top_v2.resize(top_v1.size(), handle.get_stream());
-          top_score.resize(top_v1.size(), handle.get_stream());
-        }
+      if constexpr (multi_gpu) {
+        // shuffle vertex pairs
+        auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts();
+
+        std::tie(v1, v2, std::ignore, std::ignore, std::ignore, std::ignore) =
+          detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<vertex_t,
+                                                                                         edge_t,
+                                                                                         weight_t,
+                                                                                         int>(
+            handle,
+            std::move(v1),
+            std::move(v2),
+            std::nullopt,
+            std::nullopt,
+            std::nullopt,
+            vertex_partition_range_lasts);
+      }
 
-        thrust::copy(
-          handle.get_thrust_policy(), v1.begin(), v1.begin() + top_v1.size(), top_v1.begin());
-        thrust::copy(
-          handle.get_thrust_policy(), v2.begin(), v2.begin() + top_v1.size(), top_v2.begin());
-        thrust::copy(handle.get_thrust_policy(),
-                     score.begin(),
-                     score.begin() + top_v1.size(),
-                     top_score.begin());
-
-        if constexpr (multi_gpu) {
-          bool is_root  = handle.get_comms().get_rank() == int{0};
-          auto rx_sizes = cugraph::host_scalar_gather(
-            handle.get_comms(), top_v1.size(), int{0}, handle.get_stream());
-          std::vector<size_t> rx_displs;
-          size_t gathered_size{0};
-
-          if (is_root) {
-            rx_displs.resize(handle.get_comms().get_size());
-            rx_displs[0] = 0;
-            std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1);
-            gathered_size = std::reduce(rx_sizes.begin(), rx_sizes.end());
-          }
+      auto score =
+        similarity(handle,
+                   graph_view,
+                   edge_weight_view,
+                   std::make_tuple(raft::device_span<vertex_t const>{v1.data(), v1.size()},
+                                   raft::device_span<vertex_t const>{v2.data(), v2.size()}),
+                   functor,
+                   coeff,
+                   do_expensive_check);
+
+      // Add a remove_if to remove items that are less than the last topk element
+      new_size = thrust::distance(
+        thrust::make_zip_iterator(score.begin(), v1.begin(), v2.begin()),
+        thrust::remove_if(handle.get_thrust_policy(),
+                          thrust::make_zip_iterator(score.begin(), v1.begin(), v2.begin()),
+                          thrust::make_zip_iterator(score.end(), v1.end(), v2.end()),
+                          [similarity_threshold] __device__(auto tuple) {
+                            return thrust::get<0>(tuple) < similarity_threshold;
+                          }));
+
+      score.resize(new_size, handle.get_stream());
+      v1.resize(new_size, handle.get_stream());
+      v2.resize(new_size, handle.get_stream());
+
+      thrust::sort_by_key(handle.get_thrust_policy(),
+                          score.begin(),
+                          score.end(),
+                          thrust::make_zip_iterator(v1.begin(), v2.begin()),
+                          thrust::greater<weight_t>{});
+
+      size_t v1_keep = std::min(*topk, v1.size());
+
+      if (score.size() < (top_v1.size() + v1_keep)) {
+        score.resize(top_v1.size() + v1_keep, handle.get_stream());
+        v1.resize(score.size(), handle.get_stream());
+        v2.resize(score.size(), handle.get_stream());
+      }
 
-          rmm::device_uvector<vertex_t> gathered_v1(gathered_size, handle.get_stream());
-          rmm::device_uvector<vertex_t> gathered_v2(gathered_size, handle.get_stream());
-          rmm::device_uvector<weight_t> gathered_score(gathered_size, handle.get_stream());
-
-          cugraph::device_gatherv(
-            handle.get_comms(),
-            thrust::make_zip_iterator(top_v1.begin(), top_v2.begin(), top_score.begin()),
-            thrust::make_zip_iterator(
-              gathered_v1.begin(), gathered_v2.begin(), gathered_score.begin()),
-
-            top_v1.size(),
-            rx_sizes,
-            rx_displs,
-            int{0},
-            handle.get_stream());
-
-          if (is_root) {
-            thrust::sort_by_key(handle.get_thrust_policy(),
-                                gathered_score.begin(),
-                                gathered_score.end(),
-                                thrust::make_zip_iterator(gathered_v1.begin(), gathered_v2.begin()),
-                                thrust::greater<weight_t>{});
-
-            if (gathered_v1.size() > *topk) {
-              gathered_v1.resize(*topk, handle.get_stream());
-              gathered_v2.resize(*topk, handle.get_stream());
-              gathered_score.resize(*topk, handle.get_stream());
-            }
-
-            top_v1    = std::move(gathered_v1);
-            top_v2    = std::move(gathered_v2);
-            top_score = std::move(gathered_score);
-          } else {
-            top_v1.resize(0, handle.get_stream());
-            top_v2.resize(0, handle.get_stream());
-            top_score.resize(0, handle.get_stream());
-          }
+      thrust::copy(handle.get_thrust_policy(), top_v1.begin(), top_v1.end(), v1.begin() + v1_keep);
+      thrust::copy(handle.get_thrust_policy(), top_v2.begin(), top_v2.end(), v2.begin() + v1_keep);
+      thrust::copy(
+        handle.get_thrust_policy(), top_score.begin(), top_score.end(), score.begin() + v1_keep);
+
+      thrust::sort_by_key(handle.get_thrust_policy(),
+                          score.begin(),
+                          score.end(),
+                          thrust::make_zip_iterator(v1.begin(), v2.begin()),
+                          thrust::greater<weight_t>{});
+
+      if (top_v1.size() < std::min(*topk, v1.size())) {
+        top_v1.resize(std::min(*topk, v1.size()), handle.get_stream());
+        top_v2.resize(top_v1.size(), handle.get_stream());
+        top_score.resize(top_v1.size(), handle.get_stream());
+      }
+
+      thrust::copy(
+        handle.get_thrust_policy(), v1.begin(), v1.begin() + top_v1.size(), top_v1.begin());
+      thrust::copy(
+        handle.get_thrust_policy(), v2.begin(), v2.begin() + top_v1.size(), top_v2.begin());
+      thrust::copy(handle.get_thrust_policy(),
+                   score.begin(),
+                   score.begin() + top_v1.size(),
+                   top_score.begin());
+
+      if constexpr (multi_gpu) {
+        bool is_root  = handle.get_comms().get_rank() == int{0};
+        auto rx_sizes = cugraph::host_scalar_gather(
+          handle.get_comms(), top_v1.size(), int{0}, handle.get_stream());
+        std::vector<size_t> rx_displs;
+        size_t gathered_size{0};
+
+        if (is_root) {
+          rx_displs.resize(handle.get_comms().get_size());
+          rx_displs[0] = 0;
+          std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1);
+          gathered_size = std::reduce(rx_sizes.begin(), rx_sizes.end());
         }
 
-        if (top_score.size() == *topk) {
-          raft::update_host(
-            &similarity_threshold, top_score.data() + *topk - 1, 1, handle.get_stream());
+        rmm::device_uvector<vertex_t> gathered_v1(gathered_size, handle.get_stream());
+        rmm::device_uvector<vertex_t> gathered_v2(gathered_size, handle.get_stream());
+        rmm::device_uvector<weight_t> gathered_score(gathered_size, handle.get_stream());
+
+        cugraph::device_gatherv(
+          handle.get_comms(),
+          thrust::make_zip_iterator(top_v1.begin(), top_v2.begin(), top_score.begin()),
+          thrust::make_zip_iterator(
+            gathered_v1.begin(), gathered_v2.begin(), gathered_score.begin()),
+          top_v1.size(),
+          rx_sizes,
+          rx_displs,
+          int{0},
+          handle.get_stream());
 
-          if constexpr (multi_gpu) {
-            similarity_threshold = host_scalar_bcast(
-              handle.get_comms(), similarity_threshold, int{0}, handle.get_stream());
+        if (is_root) {
+          thrust::sort_by_key(handle.get_thrust_policy(),
+                              gathered_score.begin(),
+                              gathered_score.end(),
+                              thrust::make_zip_iterator(gathered_v1.begin(), gathered_v2.begin()),
+                              thrust::greater<weight_t>{});
+
+          if (gathered_v1.size() > *topk) {
+            gathered_v1.resize(*topk, handle.get_stream());
+            gathered_v2.resize(*topk, handle.get_stream());
+            gathered_score.resize(*topk, handle.get_stream());
           }
+
+          top_v1    = std::move(gathered_v1);
+          top_v2    = std::move(gathered_v2);
+          top_score = std::move(gathered_score);
+        } else {
+          top_v1.resize(0, handle.get_stream());
+          top_v2.resize(0, handle.get_stream());
+          top_score.resize(0, handle.get_stream());
         }
       }
+
+      if (top_score.size() == *topk) {
+        raft::update_host(
+          &similarity_threshold, top_score.data() + *topk - 1, 1, handle.get_stream());
+      }
+      if constexpr (multi_gpu) {
+        similarity_threshold =
+          host_scalar_bcast(handle.get_comms(), similarity_threshold, int{0}, handle.get_stream());
+      }
     }
 
     return std::make_tuple(std::move(top_v1), std::move(top_v2), std::move(top_score));
diff --git a/cpp/src/sampling/detail/conversion_utilities.cu b/cpp/src/sampling/detail/conversion_utilities.cu
new file mode 100644
index 00000000000..0279735dc1f
--- /dev/null
+++ b/cpp/src/sampling/detail/conversion_utilities.cu
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sampling/detail/sampling_utils.hpp"
+
+#include <cugraph/utilities/misc_utils.cuh>
+
+namespace cugraph {
+namespace detail {
+
+rmm::device_uvector<int32_t> convert_starting_vertex_label_offsets_to_labels(
+  raft::handle_t const& handle, raft::device_span<size_t const> starting_vertex_label_offsets)
+{
+  return expand_sparse_offsets(starting_vertex_label_offsets, int32_t{0}, handle.get_stream());
+}
+
+template <typename label_t>
+rmm::device_uvector<int32_t> flatten_label_map(
+  raft::handle_t const& handle,
+  std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>
+    label_to_output_comm_rank)
+{
+  label_t max_label = thrust::reduce(handle.get_thrust_policy(),
+                                     std::get<0>(label_to_output_comm_rank).begin(),
+                                     std::get<0>(label_to_output_comm_rank).end(),
+                                     label_t{0},
+                                     thrust::maximum<label_t>());
+
+  rmm::device_uvector<int32_t> label_map(max_label + 1, handle.get_stream());
+
+  thrust::fill(handle.get_thrust_policy(), label_map.begin(), label_map.end(), int32_t{0});
+  thrust::scatter(handle.get_thrust_policy(),
+                  std::get<1>(label_to_output_comm_rank).begin(),
+                  std::get<1>(label_to_output_comm_rank).end(),
+                  std::get<0>(label_to_output_comm_rank).begin(),
+                  label_map.begin());
+
+  return label_map;
+}
+
+template rmm::device_uvector<int32_t> flatten_label_map(
+  raft::handle_t const& handle,
+  std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>
+    label_to_output_comm_rank);
+
+}  // namespace detail
+}  // namespace cugraph
diff --git a/cpp/src/sampling/detail/sampling_utils.hpp b/cpp/src/sampling/detail/sampling_utils.hpp
index 102f9ec58f7..17eb8dd0873 100644
--- a/cpp/src/sampling/detail/sampling_utils.hpp
+++ b/cpp/src/sampling/detail/sampling_utils.hpp
@@ -293,7 +293,41 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<edge_type_t>>&& edge_types,
   std::optional<rmm::device_uvector<int32_t>>&& hops,
   std::optional<rmm::device_uvector<label_t>>&& labels,
-  std::optional<std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>>
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank);
+
+/**
+ * @brief   Convert the starting vertex offsets into starting vertex labels
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param starting_vertex_label_offsets Offsets array defining where each vertex label begins
+ *
+ * @returns device vector containing labels for each starting vertex
+ */
+rmm::device_uvector<int32_t> convert_starting_vertex_label_offsets_to_labels(
+  raft::handle_t const& handle, raft::device_span<size_t const> starting_vertex_label_offsets);
+
+/**
+ * @brief   Flatten the legacy label_to_output_comm_rank into the new structure
+ *
+ * Legacy structure supported arbitrary labels, the new structure is a dense mapping of labels from
+ * [0,n).
+ *
+ * @tparam label_t typename for the label
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param label_to_output_comm_rank  A tuple containing label ids and the comm rank each label
+ * should be assigned to
+ *
+ * @returns device vector containing the mapping to comm_rank.  Entry `i` will be the comm rank
+ * destination for label `i`.
+ */
+template <typename label_t>
+rmm::device_uvector<int32_t> flatten_label_map(
+  raft::handle_t const& handle,
+  std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>
     label_to_output_comm_rank);
+
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/sampling/detail/shuffle_and_organize_output_impl.cuh b/cpp/src/sampling/detail/shuffle_and_organize_output_impl.cuh
index ec14e99baec..391dd99b1df 100644
--- a/cpp/src/sampling/detail/shuffle_and_organize_output_impl.cuh
+++ b/cpp/src/sampling/detail/shuffle_and_organize_output_impl.cuh
@@ -41,14 +41,12 @@ namespace detail {
 
 template <typename label_t>
 struct shuffle_to_output_comm_rank_t {
-  raft::device_span<label_t const> output_label_;
   raft::device_span<int32_t const> output_rank_;
 
   template <typename key_t>
   __device__ int32_t operator()(key_t key) const
   {
-    auto pos = thrust::lower_bound(thrust::seq, output_label_.begin(), output_label_.end(), key);
-    return output_rank_[thrust::distance(output_label_.begin(), pos)];
+    return output_rank_[key];
   }
 };
 
@@ -206,8 +204,7 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<edge_type_t>>&& edge_types,
   std::optional<rmm::device_uvector<int32_t>>&& hops,
   std::optional<rmm::device_uvector<label_t>>&& labels,
-  std::optional<std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank)
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank)
 {
   std::optional<rmm::device_uvector<size_t>> offsets{std::nullopt};
 
@@ -215,8 +212,6 @@ shuffle_and_organize_output(
     sort_sampled_tuples(handle, majors, minors, weights, edge_ids, edge_types, hops, *labels);
 
     if (label_to_output_comm_rank) {
-      CUGRAPH_EXPECTS(labels, "labels must be specified in order to shuffle sampling results");
-
       auto& comm           = handle.get_comms();
       auto const comm_size = comm.get_size();
 
@@ -247,8 +242,7 @@ shuffle_and_organize_output(
                                           edge_ids->begin(),
                                           edge_types->begin(),
                                           hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -282,8 +276,7 @@ shuffle_and_organize_output(
                                           weights->begin(),
                                           edge_ids->begin(),
                                           edge_types->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -317,8 +310,7 @@ shuffle_and_organize_output(
                                           weights->begin(),
                                           edge_ids->begin(),
                                           hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -347,8 +339,7 @@ shuffle_and_organize_output(
                 labels->end(),
                 thrust::make_zip_iterator(
                   majors.begin(), minors.begin(), weights->begin(), edge_ids->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -383,8 +374,7 @@ shuffle_and_organize_output(
                                           weights->begin(),
                                           edge_types->begin(),
                                           hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -413,8 +403,7 @@ shuffle_and_organize_output(
                 labels->end(),
                 thrust::make_zip_iterator(
                   majors.begin(), minors.begin(), weights->begin(), edge_types->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -444,8 +433,7 @@ shuffle_and_organize_output(
                 labels->end(),
                 thrust::make_zip_iterator(
                   majors.begin(), minors.begin(), weights->begin(), hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -471,8 +459,7 @@ shuffle_and_organize_output(
                 labels->begin(),
                 labels->end(),
                 thrust::make_zip_iterator(majors.begin(), minors.begin(), weights->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -505,8 +492,7 @@ shuffle_and_organize_output(
                                           edge_ids->begin(),
                                           edge_types->begin(),
                                           hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -535,8 +521,7 @@ shuffle_and_organize_output(
                 labels->end(),
                 thrust::make_zip_iterator(
                   majors.begin(), minors.begin(), edge_ids->begin(), edge_types->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -566,8 +551,7 @@ shuffle_and_organize_output(
                 labels->end(),
                 thrust::make_zip_iterator(
                   majors.begin(), minors.begin(), edge_ids->begin(), hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -593,8 +577,7 @@ shuffle_and_organize_output(
                 labels->begin(),
                 labels->end(),
                 thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_ids->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -623,8 +606,7 @@ shuffle_and_organize_output(
                 labels->end(),
                 thrust::make_zip_iterator(
                   majors.begin(), minors.begin(), edge_types->begin(), hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -651,8 +633,7 @@ shuffle_and_organize_output(
                 labels->begin(),
                 labels->end(),
                 thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_types->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -678,8 +659,7 @@ shuffle_and_organize_output(
                 labels->begin(),
                 labels->end(),
                 thrust::make_zip_iterator(majors.begin(), minors.begin(), hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -702,8 +682,7 @@ shuffle_and_organize_output(
                 labels->begin(),
                 labels->end(),
                 thrust::make_zip_iterator(majors.begin(), minors.begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
diff --git a/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v32_e32.cu b/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v32_e32.cu
index 73a152487ca..4a264469c97 100644
--- a/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v32_e32.cu
+++ b/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v32_e32.cu
@@ -36,8 +36,7 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<int32_t>>&& edge_types,
   std::optional<rmm::device_uvector<int32_t>>&& hops,
   std::optional<rmm::device_uvector<int32_t>>&& labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank);
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -56,8 +55,7 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<int32_t>>&& edge_types,
   std::optional<rmm::device_uvector<int32_t>>&& hops,
   std::optional<rmm::device_uvector<int32_t>>&& labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank);
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank);
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v64_e64.cu b/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v64_e64.cu
index ff7a716e609..f66ce3e2d63 100644
--- a/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v64_e64.cu
+++ b/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v64_e64.cu
@@ -36,8 +36,7 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<int32_t>>&& edge_types,
   std::optional<rmm::device_uvector<int32_t>>&& hops,
   std::optional<rmm::device_uvector<int32_t>>&& labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank);
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
@@ -56,8 +55,7 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<int32_t>>&& edge_types,
   std::optional<rmm::device_uvector<int32_t>>&& hops,
   std::optional<rmm::device_uvector<int32_t>>&& labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank);
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank);
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_impl.hpp b/cpp/src/sampling/neighbor_sampling_impl.hpp
index d8e8cc2b756..ccca71cdf20 100644
--- a/cpp/src/sampling/neighbor_sampling_impl.hpp
+++ b/cpp/src/sampling/neighbor_sampling_impl.hpp
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include "prims/fill_edge_property.cuh"
+#include "prims/transform_e.cuh"
 #include "sampling/detail/sampling_utils.hpp"
 
 #include <cugraph/detail/shuffle_wrappers.hpp>
@@ -48,41 +50,34 @@ std::tuple<rmm::device_uvector<vertex_t>,
            std::optional<rmm::device_uvector<int32_t>>,
            std::optional<rmm::device_uvector<label_t>>,
            std::optional<rmm::device_uvector<size_t>>>
-neighbor_sample_impl(
-  raft::handle_t const& handle,
-  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
-  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
-  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
-  std::optional<edge_property_view_t<edge_t, bias_t const*>> edge_bias_view,
-  raft::device_span<vertex_t const> this_frontier_vertices,
-  std::optional<raft::device_span<label_t const>> this_frontier_vertex_labels,
-  std::optional<std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  raft::random::RngState& rng_state,
-  bool do_expensive_check)
+neighbor_sample_impl(raft::handle_t const& handle,
+                     raft::random::RngState& rng_state,
+                     graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+                     std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+                     std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+                     std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+                     std::optional<edge_property_view_t<edge_t, bias_t const*>> edge_bias_view,
+                     raft::device_span<vertex_t const> starting_vertices,
+                     std::optional<raft::device_span<label_t const>> starting_vertex_labels,
+                     std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+                     raft::host_span<int32_t const> fan_out,
+                     edge_type_t num_edge_types,
+                     bool return_hops,
+                     bool with_replacement,
+                     prior_sources_behavior_t prior_sources_behavior,
+                     bool dedupe_sources,
+                     bool do_expensive_check)
 {
   static_assert(std::is_floating_point_v<bias_t>);
 
-  CUGRAPH_EXPECTS(fan_out.size() > 0, "Invalid input argument: number of levels must be non-zero.");
-  CUGRAPH_EXPECTS(
-    fan_out.size() <= static_cast<size_t>(std::numeric_limits<int32_t>::max()),
-    "Invalid input argument: number of levels should not overflow int32_t");  // as we use int32_t
-                                                                              // to store hops
-
   if constexpr (!multi_gpu) {
     CUGRAPH_EXPECTS(!label_to_output_comm_rank,
                     "cannot specify output GPU mapping in SG implementation");
   }
 
   CUGRAPH_EXPECTS(
-    !label_to_output_comm_rank || this_frontier_vertex_labels,
-    "cannot specify output GPU mapping without also specifying this_frontier_vertex_labels");
+    !label_to_output_comm_rank || starting_vertex_labels,
+    "cannot specify output GPU mapping without also specifying starting_vertex_labels");
 
   if (do_expensive_check) {
     if (edge_bias_view) {
@@ -96,10 +91,45 @@ neighbor_sample_impl(
                       "Invalid input argument: sum of neighboring edge bias values should not "
                       "exceed std::numeric_limits<bias_t>::max() for any vertex.");
     }
+  }
+
+  CUGRAPH_EXPECTS(fan_out.size() > 0, "Invalid input argument: number of levels must be non-zero.");
+  CUGRAPH_EXPECTS(
+    fan_out.size() <= static_cast<size_t>(std::numeric_limits<int32_t>::max()),
+    "Invalid input argument: number of levels should not overflow int32_t");  // as we use int32_t
+                                                                              // to store hops
 
-    if (label_to_output_comm_rank) {
-      CUGRAPH_EXPECTS(cugraph::detail::is_sorted(handle, std::get<0>(*label_to_output_comm_rank)),
-                      "Labels in label_to_output_comm_rank must be sorted");
+  std::vector<
+    cugraph::edge_property_t<graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>, bool>>
+    edge_masks_vector{};
+  graph_view_t<vertex_t, edge_t, false, multi_gpu> modified_graph_view = graph_view;
+  edge_masks_vector.reserve(num_edge_types);
+
+  if (num_edge_types > 1) {
+    for (int i = 0; i < num_edge_types; i++) {
+      cugraph::edge_property_t<graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>, bool>
+        edge_mask(handle, graph_view);
+
+      cugraph::fill_edge_property(
+        handle, modified_graph_view, edge_mask.mutable_view(), bool{true});
+
+      cugraph::transform_e(
+        handle,
+        modified_graph_view,
+        cugraph::edge_src_dummy_property_t{}.view(),
+        cugraph::edge_dst_dummy_property_t{}.view(),
+        *edge_type_view,
+        [valid_edge_type = i] __device__(auto src,
+                                         auto dst,
+                                         thrust::nullopt_t,
+                                         thrust::nullopt_t,
+                                         /*thrust::nullopt_t*/ auto edge_type) {
+          return edge_type == valid_edge_type;
+        },
+        edge_mask.mutable_view(),
+        false);
+
+      edge_masks_vector.push_back(std::move(edge_mask));
     }
   }
 
@@ -114,8 +144,8 @@ neighbor_sample_impl(
     edge_type_view ? std::make_optional(std::vector<rmm::device_uvector<edge_type_t>>{})
                    : std::nullopt;
   auto level_result_label_vectors =
-    this_frontier_vertex_labels ? std::make_optional(std::vector<rmm::device_uvector<label_t>>{})
-                                : std::nullopt;
+    starting_vertex_labels ? std::make_optional(std::vector<rmm::device_uvector<label_t>>{})
+                           : std::nullopt;
 
   level_result_src_vectors.reserve(fan_out.size());
   level_result_dst_vectors.reserve(fan_out.size());
@@ -126,7 +156,7 @@ neighbor_sample_impl(
 
   rmm::device_uvector<vertex_t> frontier_vertices(0, handle.get_stream());
   auto frontier_vertex_labels =
-    this_frontier_vertex_labels
+    starting_vertex_labels
       ? std::make_optional(rmm::device_uvector<label_t>{0, handle.get_stream()})
       : std::nullopt;
 
@@ -137,84 +167,95 @@ neighbor_sample_impl(
   if (prior_sources_behavior == prior_sources_behavior_t::EXCLUDE) {
     vertex_used_as_source = std::make_optional(
       std::make_tuple(rmm::device_uvector<vertex_t>{0, handle.get_stream()},
-                      this_frontier_vertex_labels
+                      starting_vertex_labels
                         ? std::make_optional(rmm::device_uvector<label_t>{0, handle.get_stream()})
                         : std::nullopt));
   }
 
   std::vector<size_t> level_sizes{};
-  int32_t hop{0};
-  for (auto&& k_level : fan_out) {
-    rmm::device_uvector<vertex_t> srcs(0, handle.get_stream());
-    rmm::device_uvector<vertex_t> dsts(0, handle.get_stream());
-    std::optional<rmm::device_uvector<weight_t>> weights{std::nullopt};
-    std::optional<rmm::device_uvector<edge_t>> edge_ids{std::nullopt};
-    std::optional<rmm::device_uvector<edge_type_t>> edge_types{std::nullopt};
-    std::optional<rmm::device_uvector<int32_t>> labels{std::nullopt};
-
-    if (k_level > 0) {
-      std::tie(srcs, dsts, weights, edge_ids, edge_types, labels) =
-        sample_edges(handle,
-                     graph_view,
-                     edge_weight_view,
-                     edge_id_view,
-                     edge_type_view,
-                     edge_bias_view,
-                     rng_state,
-                     this_frontier_vertices,
-                     this_frontier_vertex_labels,
-                     static_cast<size_t>(k_level),
-                     with_replacement);
-    } else {
-      std::tie(srcs, dsts, weights, edge_ids, edge_types, labels) =
-        gather_one_hop_edgelist(handle,
-                                graph_view,
-                                edge_weight_view,
-                                edge_id_view,
-                                edge_type_view,
-                                this_frontier_vertices,
-                                this_frontier_vertex_labels);
-    }
 
-    level_sizes.push_back(srcs.size());
-
-    level_result_src_vectors.push_back(std::move(srcs));
-    level_result_dst_vectors.push_back(std::move(dsts));
-    if (weights) { (*level_result_weight_vectors).push_back(std::move(*weights)); }
-    if (edge_ids) { (*level_result_edge_id_vectors).push_back(std::move(*edge_ids)); }
-    if (edge_types) { (*level_result_edge_type_vectors).push_back(std::move(*edge_types)); }
-    if (labels) { (*level_result_label_vectors).push_back(std::move(*labels)); }
-
-    ++hop;
-    if (hop < fan_out.size()) {
-      // FIXME:  We should modify vertex_partition_range_lasts to return a raft::host_span
-      //  rather than making a copy.
-      auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts();
-      std::tie(frontier_vertices, frontier_vertex_labels, vertex_used_as_source) =
-        prepare_next_frontier(
-          handle,
-          this_frontier_vertices,
-          this_frontier_vertex_labels,
-          raft::device_span<vertex_t const>{level_result_dst_vectors.back().data(),
-                                            level_result_dst_vectors.back().size()},
-          frontier_vertex_labels ? std::make_optional(raft::device_span<label_t const>(
-                                     level_result_label_vectors->back().data(),
-                                     level_result_label_vectors->back().size()))
-                                 : std::nullopt,
-          std::move(vertex_used_as_source),
-          graph_view.local_vertex_partition_view(),
-          vertex_partition_range_lasts,
-          prior_sources_behavior,
-          dedupe_sources,
-          do_expensive_check);
-
-      this_frontier_vertices =
-        raft::device_span<vertex_t const>(frontier_vertices.data(), frontier_vertices.size());
-
-      if (frontier_vertex_labels) {
-        this_frontier_vertex_labels = raft::device_span<label_t const>(
-          frontier_vertex_labels->data(), frontier_vertex_labels->size());
+  // Get the number of hop. If homogeneous neighbor sample, num_edge_types = 1
+  auto num_hops = ((fan_out.size() % num_edge_types) == 0)
+                    ? (fan_out.size() / num_edge_types)
+                    : ((fan_out.size() / num_edge_types) + 1);
+
+  for (auto hop = 0; hop < num_hops; hop++) {
+    for (auto edge_type_id = 0; edge_type_id < num_edge_types; edge_type_id++) {
+      auto k_level = fan_out[(hop * num_edge_types) + edge_type_id];
+      rmm::device_uvector<vertex_t> srcs(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> dsts(0, handle.get_stream());
+      std::optional<rmm::device_uvector<weight_t>> weights{std::nullopt};
+      std::optional<rmm::device_uvector<edge_t>> edge_ids{std::nullopt};
+      std::optional<rmm::device_uvector<edge_type_t>> edge_types{std::nullopt};
+      std::optional<rmm::device_uvector<int32_t>> labels{std::nullopt};
+
+      if (num_edge_types > 1) {
+        modified_graph_view.attach_edge_mask(edge_masks_vector[edge_type_id].view());
+      }
+
+      if (k_level > 0) {
+        std::tie(srcs, dsts, weights, edge_ids, edge_types, labels) =
+          sample_edges(handle,
+                       modified_graph_view,
+                       edge_weight_view,
+                       edge_id_view,
+                       edge_type_view,
+                       edge_bias_view,
+                       rng_state,
+                       starting_vertices,
+                       starting_vertex_labels,
+                       static_cast<size_t>(k_level),
+                       with_replacement);
+      } else {
+        std::tie(srcs, dsts, weights, edge_ids, edge_types, labels) =
+          gather_one_hop_edgelist(handle,
+                                  modified_graph_view,
+                                  edge_weight_view,
+                                  edge_id_view,
+                                  edge_type_view,
+                                  starting_vertices,
+                                  starting_vertex_labels);
       }
+
+      level_sizes.push_back(srcs.size());
+      level_result_src_vectors.push_back(std::move(srcs));
+      level_result_dst_vectors.push_back(std::move(dsts));
+
+      if (weights) { (*level_result_weight_vectors).push_back(std::move(*weights)); }
+      if (edge_ids) { (*level_result_edge_id_vectors).push_back(std::move(*edge_ids)); }
+      if (edge_types) { (*level_result_edge_type_vectors).push_back(std::move(*edge_types)); }
+      if (labels) { (*level_result_label_vectors).push_back(std::move(*labels)); }
+
+      if (num_edge_types > 1) { modified_graph_view.clear_edge_mask(); }
+    }
+
+    // FIXME:  We should modify vertex_partition_range_lasts to return a raft::host_span
+    //  rather than making a copy.
+    auto vertex_partition_range_lasts = modified_graph_view.vertex_partition_range_lasts();
+    std::tie(frontier_vertices, frontier_vertex_labels, vertex_used_as_source) =
+      prepare_next_frontier(
+        handle,
+        starting_vertices,
+        starting_vertex_labels,
+        raft::device_span<vertex_t const>{level_result_dst_vectors.back().data(),
+                                          level_result_dst_vectors.back().size()},
+        frontier_vertex_labels
+          ? std::make_optional(raft::device_span<label_t const>(
+              level_result_label_vectors->back().data(), level_result_label_vectors->back().size()))
+          : std::nullopt,
+        std::move(vertex_used_as_source),
+        modified_graph_view.local_vertex_partition_view(),
+        vertex_partition_range_lasts,
+        prior_sources_behavior,
+        dedupe_sources,
+        do_expensive_check);
+
+    starting_vertices =
+      raft::device_span<vertex_t const>(frontier_vertices.data(), frontier_vertices.size());
+
+    if (frontier_vertex_labels) {
+      starting_vertex_labels = raft::device_span<label_t const>(frontier_vertex_labels->data(),
+                                                                frontier_vertex_labels->size());
     }
   }
 
@@ -368,8 +409,16 @@ uniform_neighbor_sample(
   bool do_expensive_check)
 {
   using bias_t = weight_t;  // dummy
+
+  rmm::device_uvector<int32_t> label_map(0, handle.get_stream());
+
+  if (label_to_output_comm_rank) {
+    label_map = detail::flatten_label_map(handle, *label_to_output_comm_rank);
+  }
+
   return detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
     handle,
+    rng_state,
     graph_view,
     edge_weight_view,
     edge_id_view,
@@ -377,13 +426,15 @@ uniform_neighbor_sample(
     std::nullopt,
     starting_vertices,
     starting_vertex_labels,
-    label_to_output_comm_rank,
+    label_to_output_comm_rank
+      ? std::make_optional(raft::device_span<int32_t const>{label_map.data(), label_map.size()})
+      : std::nullopt,
     fan_out,
+    edge_type_t{1},
     return_hops,
     with_replacement,
     prior_sources_behavior,
     dedupe_sources,
-    rng_state,
     do_expensive_check);
 }
 
@@ -422,8 +473,15 @@ biased_neighbor_sample(
   bool dedupe_sources,
   bool do_expensive_check)
 {
+  rmm::device_uvector<int32_t> label_map(0, handle.get_stream());
+
+  if (label_to_output_comm_rank) {
+    label_map = detail::flatten_label_map(handle, *label_to_output_comm_rank);
+  }
+
   return detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
     handle,
+    rng_state,
     graph_view,
     edge_weight_view,
     edge_id_view,
@@ -431,14 +489,252 @@ biased_neighbor_sample(
     edge_bias_view,
     starting_vertices,
     starting_vertex_labels,
-    label_to_output_comm_rank,
+    label_to_output_comm_rank
+      ? std::make_optional(raft::device_span<int32_t const>{label_map.data(), label_map.size()})
+      : std::nullopt,
     fan_out,
+    edge_type_t{1},
     return_hops,
     with_replacement,
     prior_sources_behavior,
     dedupe_sources,
-    rng_state,
     do_expensive_check);
 }
 
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  edge_type_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check)
+{
+  using bias_t = weight_t;  // dummy
+
+  auto [majors, minors, weights, edge_ids, edge_types, hops, labels, offsets] =
+    detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
+      handle,
+      rng_state,
+      graph_view,
+      edge_weight_view,
+      edge_id_view,
+      edge_type_view,
+      std::optional<edge_property_view_t<edge_t, bias_t const*>>{
+        std::nullopt},  // Optional edge_bias_view
+      starting_vertices,
+      starting_vertex_labels,
+      label_to_output_comm_rank,
+      fan_out,
+      num_edge_types,
+      sampling_flags.return_hops,
+      sampling_flags.with_replacement,
+      sampling_flags.prior_sources_behavior,
+      sampling_flags.dedupe_sources,
+      do_expensive_check);
+
+  return std::make_tuple(std::move(majors),
+                         std::move(minors),
+                         std::move(weights),
+                         std::move(edge_ids),
+                         std::move(edge_types),
+                         std::move(hops),
+                         std::move(offsets));
+}
+
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          typename bias_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  edge_property_view_t<edge_t, bias_t const*> edge_bias_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  edge_type_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check)
+{
+  auto [majors, minors, weights, edge_ids, edge_types, hops, labels, offsets] =
+    detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
+      handle,
+      rng_state,
+      graph_view,
+      edge_weight_view,
+      edge_id_view,
+      edge_type_view,
+      std::make_optional(edge_bias_view),
+      starting_vertices,
+      starting_vertex_labels,
+      label_to_output_comm_rank,
+      fan_out,
+      num_edge_types,
+      sampling_flags.return_hops,
+      sampling_flags.with_replacement,
+      sampling_flags.prior_sources_behavior,
+      sampling_flags.dedupe_sources,
+      do_expensive_check);
+
+  return std::make_tuple(std::move(majors),
+                         std::move(minors),
+                         std::move(weights),
+                         std::move(edge_ids),
+                         std::move(edge_types),
+                         std::move(hops),
+                         std::move(offsets));
+}
+
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check)
+{
+  using bias_t = weight_t;  // dummy
+
+  auto [majors, minors, weights, edge_ids, edge_types, hops, labels, offsets] =
+    detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
+      handle,
+      rng_state,
+      graph_view,
+      edge_weight_view,
+      edge_id_view,
+      edge_type_view,
+      std::optional<edge_property_view_t<edge_t, bias_t const*>>{
+        std::nullopt},  // Optional edge_bias_view
+      starting_vertices,
+      starting_vertex_labels,
+      label_to_output_comm_rank,
+      fan_out,
+      edge_type_t{1},
+      sampling_flags.return_hops,
+      sampling_flags.with_replacement,
+      sampling_flags.prior_sources_behavior,
+      sampling_flags.dedupe_sources,
+      do_expensive_check);
+
+  return std::make_tuple(std::move(majors),
+                         std::move(minors),
+                         std::move(weights),
+                         std::move(edge_ids),
+                         std::move(edge_types),
+                         std::move(hops),
+                         std::move(offsets));
+}
+
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          typename bias_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  edge_property_view_t<edge_t, bias_t const*> edge_bias_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check)
+{
+  auto [majors, minors, weights, edge_ids, edge_types, hops, labels, offsets] =
+    detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
+      handle,
+      rng_state,
+      graph_view,
+      edge_weight_view,
+      edge_id_view,
+      edge_type_view,
+      std::make_optional(edge_bias_view),
+      starting_vertices,
+      starting_vertex_labels,
+      label_to_output_comm_rank,
+      fan_out,
+      edge_type_t{1},
+      sampling_flags.return_hops,
+      sampling_flags.with_replacement,
+      sampling_flags.prior_sources_behavior,
+      sampling_flags.dedupe_sources,
+      do_expensive_check);
+
+  return std::make_tuple(std::move(majors),
+                         std::move(minors),
+                         std::move(weights),
+                         std::move(edge_ids),
+                         std::move(edge_types),
+                         std::move(hops),
+                         std::move(offsets));
+}
+
 }  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_mg_v32_e32.cpp b/cpp/src/sampling/neighbor_sampling_mg_v32_e32.cpp
deleted file mode 100644
index f61c1c10c53..00000000000
--- a/cpp/src/sampling/neighbor_sampling_mg_v32_e32.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "neighbor_sampling_impl.hpp"
-
-#include <cugraph/algorithms.hpp>
-#include <cugraph/sampling_functions.hpp>
-
-namespace cugraph {
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int32_t, float const*> edge_bias_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int32_t, double const*> edge_bias_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_mg_v32_e32.cu b/cpp/src/sampling/neighbor_sampling_mg_v32_e32.cu
new file mode 100644
index 00000000000..d848935cc7e
--- /dev/null
+++ b/cpp/src/sampling/neighbor_sampling_mg_v32_e32.cu
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "neighbor_sampling_impl.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/sampling_functions.hpp>
+
+namespace cugraph {
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, float const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, double const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, double const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, float const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, double const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, float const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_mg_v64_e64.cpp b/cpp/src/sampling/neighbor_sampling_mg_v64_e64.cpp
deleted file mode 100644
index ea3f6b466da..00000000000
--- a/cpp/src/sampling/neighbor_sampling_mg_v64_e64.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "neighbor_sampling_impl.hpp"
-
-#include <cugraph/algorithms.hpp>
-#include <cugraph/sampling_functions.hpp>
-
-namespace cugraph {
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int64_t, float const*> edge_bias_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int64_t, double const*> edge_bias_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_mg_v64_e64.cu b/cpp/src/sampling/neighbor_sampling_mg_v64_e64.cu
new file mode 100644
index 00000000000..505deec51f5
--- /dev/null
+++ b/cpp/src/sampling/neighbor_sampling_mg_v64_e64.cu
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "neighbor_sampling_impl.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/sampling_functions.hpp>
+
+namespace cugraph {
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, float const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, double const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, double const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, float const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, double const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, float const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_sg_v32_e32.cpp b/cpp/src/sampling/neighbor_sampling_sg_v32_e32.cpp
deleted file mode 100644
index 0f0affbb323..00000000000
--- a/cpp/src/sampling/neighbor_sampling_sg_v32_e32.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "neighbor_sampling_impl.hpp"
-
-#include <cugraph/algorithms.hpp>
-#include <cugraph/sampling_functions.hpp>
-
-namespace cugraph {
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int32_t, float const*> edge_bias_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int32_t, double const*> edge_bias_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_sg_v32_e32.cu b/cpp/src/sampling/neighbor_sampling_sg_v32_e32.cu
new file mode 100644
index 00000000000..72bbb4e27a8
--- /dev/null
+++ b/cpp/src/sampling/neighbor_sampling_sg_v32_e32.cu
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "neighbor_sampling_impl.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/sampling_functions.hpp>
+
+namespace cugraph {
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, float const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, double const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, double const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, float const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, double const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, float const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_sg_v64_e64.cpp b/cpp/src/sampling/neighbor_sampling_sg_v64_e64.cpp
deleted file mode 100644
index 70dd9a59842..00000000000
--- a/cpp/src/sampling/neighbor_sampling_sg_v64_e64.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "neighbor_sampling_impl.hpp"
-
-#include <cugraph/algorithms.hpp>
-#include <cugraph/sampling_functions.hpp>
-
-namespace cugraph {
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int64_t, float const*> edge_bias_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int64_t, double const*> edge_bias_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_sg_v64_e64.cu b/cpp/src/sampling/neighbor_sampling_sg_v64_e64.cu
new file mode 100644
index 00000000000..6aa8c71429a
--- /dev/null
+++ b/cpp/src/sampling/neighbor_sampling_sg_v64_e64.cu
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "neighbor_sampling_impl.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/sampling_functions.hpp>
+
+namespace cugraph {
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, float const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, double const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, double const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, float const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, double const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, float const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+}  // namespace cugraph
diff --git a/cpp/src/utilities/shuffle_vertices.cuh b/cpp/src/utilities/shuffle_vertices.cuh
index adce03f7c29..5ed6513816f 100644
--- a/cpp/src/utilities/shuffle_vertices.cuh
+++ b/cpp/src/utilities/shuffle_vertices.cuh
@@ -44,22 +44,43 @@ rmm::device_uvector<vertex_t> shuffle_vertices_by_gpu_id_impl(
   return d_rx_vertices;
 }
 
-template <typename vertex_t, typename value_t, typename func_t>
-std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<value_t>>
-shuffle_vertices_and_values_by_gpu_id_impl(raft::handle_t const& handle,
-                                           rmm::device_uvector<vertex_t>&& d_vertices,
-                                           rmm::device_uvector<value_t>&& d_values,
-                                           func_t func)
+template <typename vertex_t, typename value0_t, typename value1_t, typename func_t>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<value0_t>,
+           std::optional<rmm::device_uvector<value1_t>>>
+shuffle_vertices_and_values_by_gpu_id_impl(
+  raft::handle_t const& handle,
+  rmm::device_uvector<vertex_t>&& d_vertices,
+  rmm::device_uvector<value0_t>&& d_values_0,
+  std::optional<rmm::device_uvector<value1_t>>&& d_values_1,
+  func_t func)
 {
-  std::tie(d_vertices, d_values, std::ignore) = cugraph::groupby_gpu_id_and_shuffle_kv_pairs(
-    handle.get_comms(),
-    d_vertices.begin(),
-    d_vertices.end(),
-    d_values.begin(),
-    [key_func = func] __device__(auto val) { return key_func(val); },
-    handle.get_stream());
-
-  return std::make_tuple(std::move(d_vertices), std::move(d_values));
+  if (d_values_1) {
+    auto [d_shuffled_vertices, d_values, counts] = cugraph::groupby_gpu_id_and_shuffle_kv_pairs(
+      handle.get_comms(),
+      d_vertices.begin(),
+      d_vertices.end(),
+      thrust::make_zip_iterator(d_values_0.begin(), (*d_values_1).begin()),
+      [key_func = func] __device__(auto val) { return key_func(val); },
+      handle.get_stream());
+
+    return std::make_tuple(std::move(d_shuffled_vertices),
+                           std::move(std::get<0>(d_values)),
+                           std::make_optional(std::move(std::get<1>(d_values))));
+  } else {
+    auto [d_shuffled_vertices, d_values, counts] = cugraph::groupby_gpu_id_and_shuffle_kv_pairs(
+      handle.get_comms(),
+      d_vertices.begin(),
+      d_vertices.end(),
+      d_values_0.begin(),
+      [key_func = func] __device__(auto val) { return key_func(val); },
+      handle.get_stream());
+
+    auto d_values_1 = std::optional<rmm::device_uvector<int32_t>>{std::nullopt};
+
+    return std::make_tuple(
+      std::move(d_shuffled_vertices), std::move(d_values), std::move(d_values_1));
+  }
 }
 
 }  // namespace
@@ -96,12 +117,18 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
   auto const minor_comm_size = minor_comm.get_size();
 
-  return shuffle_vertices_and_values_by_gpu_id_impl(
+  rmm::device_uvector<vertex_t> d_vertices(0, handle.get_stream());
+  rmm::device_uvector<value_t> d_values(0, handle.get_stream());
+
+  std::tie(d_vertices, d_values, std::ignore) = shuffle_vertices_and_values_by_gpu_id_impl(
     handle,
     std::move(vertices),
     std::move(values),
+    std::optional<rmm::device_uvector<int32_t>>{std::nullopt},
     cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size});
+
+  return std::make_tuple(std::move(d_vertices), std::move(d_values));
 }
 
 template <typename vertex_t>
@@ -154,17 +181,21 @@ shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
   auto const minor_comm_size = minor_comm.get_size();
 
-  auto return_value = shuffle_vertices_and_values_by_gpu_id_impl(
+  rmm::device_uvector<vertex_t> d_vertices(0, handle.get_stream());
+  rmm::device_uvector<value_t> d_values(0, handle.get_stream());
+
+  std::tie(d_vertices, d_values, std::ignore) = shuffle_vertices_and_values_by_gpu_id_impl(
     handle,
     std::move(vertices),
     std::move(values),
+    std::optional<rmm::device_uvector<int32_t>>{std::nullopt},
     cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t>{
       raft::device_span<vertex_t const>(d_vertex_partition_range_lasts.data(),
                                         d_vertex_partition_range_lasts.size()),
       major_comm_size,
       minor_comm_size});
 
-  return return_value;
+  return std::make_tuple(std::move(d_vertices), std::move(d_values));
 }
 
 }  // namespace detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 3752e823659..a2eeafea8cf 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -483,10 +483,29 @@ ConfigureTest(RANDOM_WALKS_TEST sampling/sg_random_walks_test.cpp)
 # - UNIFORM NBR SAMPLING tests --------------------------------------------------------------------
 ConfigureTest(UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/uniform_neighbor_sampling.cpp)
 
+# - HOMOGENEOUS UNIFORM NBR SAMPLING tests --------------------------------------------------------
+ConfigureTest(
+    HOMOGENEOUS_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/homogeneous_uniform_neighbor_sampling.cpp)
+
+# - HETEROGENEOUS UNIFORM NBR SAMPLING tests -----------------------------------------------------
+ConfigureTest(
+    HETEROGENEOUS_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/heterogeneous_uniform_neighbor_sampling.cpp)
+
 ###################################################################################################
 # - BIASED NBR SAMPLING tests ---------------------------------------------------------------------
 ConfigureTest(BIASED_NEIGHBOR_SAMPLING_TEST sampling/biased_neighbor_sampling.cpp)
 
+###################################################################################################
+# - HOMOGENEOUS BIASED NBR SAMPLING tests ---------------------------------------------------------
+ConfigureTest(
+    HOMOGENEOUS_BIASED_NEIGHBOR_SAMPLING_TEST sampling/homogeneous_biased_neighbor_sampling.cpp)
+
+###################################################################################################
+# - HETEROGENEOUS BIASED NBR SAMPLING tests -------------------------------------------------------
+ConfigureTest(
+    HETEROGENEOUS_BIASED_NEIGHBOR_SAMPLING_TESTT sampling/heterogeneous_biased_neighbor_sampling.cpp
+        GPUS 1 PERCENT 75)
+
 ###################################################################################################
 # - SAMPLING_POST_PROCESSING tests ----------------------------------------------------------------
 ConfigureTest(SAMPLING_POST_PROCESSING_TEST sampling/sampling_post_processing_test.cpp)
@@ -751,6 +770,26 @@ if(BUILD_CUGRAPH_MG_TESTS)
     # - MG UNIFORM NBR SAMPLING tests -------------------------------------------------------------
     ConfigureTestMG(MG_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/mg_uniform_neighbor_sampling.cpp)
 
+    ###############################################################################################
+    # - MG HOMOGENEOUS UNIFORM NBR SAMPLING tests -------------------------------------------------
+    ConfigureTestMG(
+        MG_HOMOGENEOUS_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/mg_homogeneous_uniform_neighbor_sampling.cpp)
+
+    ###############################################################################################
+    # - MG HETEROGENEOUS UNIFORM NBR SAMPLING tests -------------------------------------------------
+    ConfigureTestMG(
+        MG_HETEROGENEOUS_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/mg_heterogeneous_uniform_neighbor_sampling.cpp)
+
+    ###############################################################################################
+    # - MG HOMOGENEOUS BIASED NBR SAMPLING tests --------------------------------------------------
+    ConfigureTestMG(
+        MG_HOMOGENEOUS_BIASED_NEIGHBOR_SAMPLING_TEST sampling/mg_homogeneous_biased_neighbor_sampling.cpp)
+
+    ###############################################################################################
+    # - MG HETEROGENEOUS BIASED NBR SAMPLING tests --------------------------------------------------
+    ConfigureTestMG(
+        MG_HETEROGENEOUS_BIASED_NEIGHBOR_SAMPLING_TEST sampling/mg_heterogeneous_biased_neighbor_sampling.cpp)
+
     ###############################################################################################
     # - MG BIASED NBR SAMPLING tests --------------------------------------------------------------
     ConfigureTestMG(MG_BIASED_NEIGHBOR_SAMPLING_TEST sampling/mg_biased_neighbor_sampling.cpp)
diff --git a/cpp/tests/community/balanced_edge_test.cpp b/cpp/tests/community/balanced_edge_test.cpp
index c4488dc9b9e..614a4ee4190 100644
--- a/cpp/tests/community/balanced_edge_test.cpp
+++ b/cpp/tests/community/balanced_edge_test.cpp
@@ -15,7 +15,7 @@
 #include <rmm/device_vector.hpp>
 #include <rmm/exec_policy.hpp>
 
-TEST(balanced_edge, success)
+TEST(balanced_edge, DISABLED_success)
 {
   std::vector<int> off_h = {0,  16,  25,  35,  41,  44,  48,  52,  56,  61,  63, 66,
                             67, 69,  74,  76,  78,  80,  82,  84,  87,  89,  91, 93,
diff --git a/cpp/tests/link_prediction/mg_similarity_test.cpp b/cpp/tests/link_prediction/mg_similarity_test.cpp
index 302248fe516..87214c808da 100644
--- a/cpp/tests/link_prediction/mg_similarity_test.cpp
+++ b/cpp/tests/link_prediction/mg_similarity_test.cpp
@@ -29,7 +29,10 @@
 struct Similarity_Usecase {
   bool use_weights{false};
   bool check_correctness{true};
-  size_t max_seeds{std::numeric_limits<size_t>::max()};
+  bool all_pairs{false};
+  std::optional<size_t> max_seeds{std::nullopt};
+  std::optional<size_t> max_vertex_pairs_to_check{std::nullopt};
+  std::optional<size_t> topk{std::nullopt};
 };
 
 template <typename input_usecase_t>
@@ -80,56 +83,96 @@ class Tests_MGSimilarity
     auto mg_edge_weight_view =
       mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
 
-    rmm::device_uvector<vertex_t> d_start_vertices(
-      std::min(
-        static_cast<size_t>(mg_graph_view.local_vertex_partition_range_size()),
-        similarity_usecase.max_seeds / comm_size +
-          (static_cast<size_t>(comm_rank) < similarity_usecase.max_seeds % comm_size ? 1 : 0)),
-      handle_->get_stream());
-    cugraph::test::populate_vertex_ids(
-      *handle_, d_start_vertices, mg_graph_view.local_vertex_partition_range_first());
-
-    auto [d_offsets, two_hop_nbrs] = cugraph::k_hop_nbrs(
-      *handle_,
-      mg_graph_view,
-      raft::device_span<vertex_t const>(d_start_vertices.data(), d_start_vertices.size()),
-      2);
-
-    auto h_start_vertices = cugraph::test::to_host(*handle_, d_start_vertices);
-    auto h_offsets        = cugraph::test::to_host(*handle_, d_offsets);
-
-    std::vector<vertex_t> h_v1(h_offsets.back());
-    for (size_t i = 0; i < h_start_vertices.size(); ++i) {
-      std::fill(h_v1.begin() + h_offsets[i], h_v1.begin() + h_offsets[i + 1], h_start_vertices[i]);
-    }
+    rmm::device_uvector<vertex_t> v1(0, handle_->get_stream());
+    rmm::device_uvector<vertex_t> v2(0, handle_->get_stream());
+    rmm::device_uvector<weight_t> result_score(0, handle_->get_stream());
 
-    auto d_v1 = cugraph::test::to_device(*handle_, h_v1);
-    auto d_v2 = std::move(two_hop_nbrs);
-
-    std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore, std::ignore) =
-      cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
-        vertex_t,
-        edge_t,
-        weight_t,
-        int32_t>(*handle_,
-                 std::move(d_v1),
-                 std::move(d_v2),
-                 std::nullopt,
-                 std::nullopt,
-                 std::nullopt,
-                 mg_graph_view.vertex_partition_range_lasts());
-
-    std::tuple<raft::device_span<vertex_t const>, raft::device_span<vertex_t const>> vertex_pairs{
-      {d_v1.data(), d_v1.size()}, {d_v2.data(), d_v2.size()}};
+    raft::random::RngState rng_state{0};
 
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-      handle_->get_comms().barrier();
-      hr_timer.start("MG similarity test");
+    rmm::device_uvector<vertex_t> sources(0, handle_->get_stream());
+    std::optional<raft::device_span<vertex_t const>> sources_span{std::nullopt};
+
+    if (similarity_usecase.max_seeds) {
+      sources = cugraph::select_random_vertices(
+        *handle_,
+        mg_graph_view,
+        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+        rng_state,
+        std::min(*similarity_usecase.max_seeds,
+                 static_cast<size_t>(mg_graph_view.number_of_vertices())),
+        false,
+        false);
+      sources_span = raft::device_span<vertex_t const>{sources.data(), sources.size()};
     }
 
-    auto result_score = test_functor.run(
-      *handle_, mg_graph_view, mg_edge_weight_view, vertex_pairs, similarity_usecase.use_weights);
+    if (similarity_usecase.all_pairs) {
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        handle_->get_comms().barrier();
+        hr_timer.start("MG similarity test");
+      }
+
+      std::tie(v1, v2, result_score) = test_functor.run(*handle_,
+                                                        mg_graph_view,
+                                                        mg_edge_weight_view,
+                                                        sources_span,
+                                                        similarity_usecase.use_weights,
+                                                        similarity_usecase.topk);
+    } else {
+      if (!sources_span) {
+        sources.resize(mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
+        cugraph::test::populate_vertex_ids(
+          *handle_, sources, mg_graph_view.local_vertex_partition_range_first());
+        sources_span = raft::device_span<vertex_t const>{sources.data(), sources.size()};
+      }
+
+      rmm::device_uvector<size_t> offsets(0, handle_->get_stream());
+
+      std::tie(offsets, v2) = cugraph::k_hop_nbrs(*handle_, mg_graph_view, *sources_span, 2);
+
+      v1.resize(v2.size(), handle_->get_stream());
+      cugraph::test::expand_sparse_offsets(
+        *handle_,
+        raft::device_span<size_t const>{offsets.data(), offsets.size()},
+        raft::device_span<vertex_t>{v1.data(), v1.size()},
+        size_t{0},
+        vertex_t{0});
+
+      cugraph::unrenumber_local_int_vertices(*handle_,
+                                             v1.data(),
+                                             v1.size(),
+                                             sources.data(),
+                                             vertex_t{0},
+                                             static_cast<vertex_t>(sources.size()),
+                                             true);
+
+      std::tie(v1, v2) = cugraph::test::remove_self_loops(*handle_, std::move(v1), std::move(v2));
+
+      std::tie(v1, v2, std::ignore, std::ignore, std::ignore, std::ignore) =
+        cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
+          vertex_t,
+          edge_t,
+          weight_t,
+          int32_t>(*handle_,
+                   std::move(v1),
+                   std::move(v2),
+                   std::nullopt,
+                   std::nullopt,
+                   std::nullopt,
+                   mg_graph_view.vertex_partition_range_lasts());
+
+      std::tuple<raft::device_span<vertex_t const>, raft::device_span<vertex_t const>> vertex_pairs{
+        {v1.data(), v1.size()}, {v2.data(), v2.size()}};
+
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        handle_->get_comms().barrier();
+        hr_timer.start("MG similarity test");
+      }
+
+      result_score = test_functor.run(
+        *handle_, mg_graph_view, mg_edge_weight_view, vertex_pairs, similarity_usecase.use_weights);
+    }
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
@@ -147,14 +190,14 @@ class Tests_MGSimilarity
         mg_edge_weight_view,
         std::optional<raft::device_span<vertex_t const>>(std::nullopt));
 
-      d_v1 = cugraph::test::device_gatherv(*handle_, d_v1.data(), d_v1.size());
-      d_v2 = cugraph::test::device_gatherv(*handle_, d_v2.data(), d_v2.size());
+      v1 = cugraph::test::device_gatherv(*handle_, v1.data(), v1.size());
+      v2 = cugraph::test::device_gatherv(*handle_, v2.data(), v2.size());
       result_score =
         cugraph::test::device_gatherv(*handle_, result_score.data(), result_score.size());
 
-      if (d_v1.size() > 0) {
-        auto h_vertex_pair1 = cugraph::test::to_host(*handle_, d_v1);
-        auto h_vertex_pair2 = cugraph::test::to_host(*handle_, d_v2);
+      if (v1.size() > 0) {
+        auto h_vertex_pair1 = cugraph::test::to_host(*handle_, v1);
+        auto h_vertex_pair2 = cugraph::test::to_host(*handle_, v2);
         auto h_result_score = cugraph::test::to_host(*handle_, result_score);
 
         similarity_compare(mg_graph_view.number_of_vertices(),
@@ -258,10 +301,13 @@ INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGSimilarity_File,
   ::testing::Combine(
-    // enable correctness checks
-    // Disable weighted computation testing in 22.10
-    //::testing::Values(Similarity_Usecase{true, true, 20}, Similarity_Usecase{false, true, 20}),
-    ::testing::Values(Similarity_Usecase{false, true, 20}),
+    ::testing::Values(Similarity_Usecase{false, true, false, 20, 100},
+                      Similarity_Usecase{false, true, false, 20, 100},
+                      Similarity_Usecase{false, true, false, 20, 100, 10},
+                      Similarity_Usecase{false, true, true, 20, 100},
+                      Similarity_Usecase{false, true, true, 20, 100},
+                      Similarity_Usecase{false, true, true, std::nullopt, 100, 10},
+                      Similarity_Usecase{false, true, true, 20, 100, 10}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/netscience.mtx"))));
 
@@ -273,7 +319,13 @@ INSTANTIATE_TEST_SUITE_P(
     // Disable weighted computation testing in 22.10
     //::testing::Values(Similarity_Usecase{true, true, 20},
     // Similarity_Usecase{false, true, 20}),
-    ::testing::Values(Similarity_Usecase{false, true, 20}),
+    ::testing::Values(Similarity_Usecase{false, true, false, 20, 100},
+                      Similarity_Usecase{false, true, false, 20, 100},
+                      Similarity_Usecase{false, true, false, 20, 100, 10},
+                      Similarity_Usecase{false, true, true, 20, 100},
+                      Similarity_Usecase{false, true, true, 20, 100},
+                      Similarity_Usecase{false, true, true, std::nullopt, 100, 10},
+                      Similarity_Usecase{false, true, true, 20, 100, 10}),
     ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false))));
 
 INSTANTIATE_TEST_SUITE_P(
@@ -285,7 +337,12 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_MGSimilarity_Rmat,
   ::testing::Combine(
     // disable correctness checks for large graphs
-    ::testing::Values(Similarity_Usecase{false, false, 20}),
+    ::testing::Values(Similarity_Usecase{false, true, false, 20, 100},
+                      Similarity_Usecase{false, true, false, 20, 100},
+                      Similarity_Usecase{false, true, false, 20, 100, 10},
+                      Similarity_Usecase{false, true, true, 20, 100},
+                      Similarity_Usecase{false, true, true, 20, 100},
+                      Similarity_Usecase{false, true, true, 20, 100, 10}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 16, 0.57, 0.19, 0.19, 0, true, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/link_prediction/similarity_test.cu b/cpp/tests/link_prediction/similarity_test.cu
index ec6db102830..0c4f526264f 100644
--- a/cpp/tests/link_prediction/similarity_test.cu
+++ b/cpp/tests/link_prediction/similarity_test.cu
@@ -83,11 +83,6 @@ class Tests_Similarity
     auto edge_weight_view =
       edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt;
 
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-      hr_timer.start("Similarity test");
-    }
-
     rmm::device_uvector<vertex_t> v1(0, handle.get_stream());
     rmm::device_uvector<vertex_t> v2(0, handle.get_stream());
     rmm::device_uvector<weight_t> result_score(0, handle.get_stream());
@@ -111,6 +106,11 @@ class Tests_Similarity
     }
 
     if (similarity_usecase.all_pairs) {
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        hr_timer.start("Similarity test");
+      }
+
       std::tie(v1, v2, result_score) = test_functor.run(handle,
                                                         graph_view,
                                                         edge_weight_view,
@@ -141,21 +141,17 @@ class Tests_Similarity
                                              static_cast<vertex_t>(sources.size()),
                                              true);
 
-      auto new_size = thrust::distance(
-        thrust::make_zip_iterator(v1.begin(), v2.begin()),
-        thrust::remove_if(
-          handle.get_thrust_policy(),
-          thrust::make_zip_iterator(v1.begin(), v2.begin()),
-          thrust::make_zip_iterator(v1.end(), v2.end()),
-          [] __device__(auto tuple) { return thrust::get<0>(tuple) == thrust::get<1>(tuple); }));
-
-      v1.resize(new_size, handle.get_stream());
-      v2.resize(new_size, handle.get_stream());
+      std::tie(v1, v2) = cugraph::test::remove_self_loops(handle, std::move(v1), std::move(v2));
 
       // FIXME:  Need to add some tests that specify actual vertex pairs
       std::tuple<raft::device_span<vertex_t const>, raft::device_span<vertex_t const>> vertex_pairs{
         {v1.data(), v1.size()}, {v2.data(), v2.size()}};
 
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        hr_timer.start("Similarity test");
+      }
+
       result_score = test_functor.run(
         handle, graph_view, edge_weight_view, vertex_pairs, similarity_usecase.use_weights);
     }
diff --git a/cpp/tests/sampling/heterogeneous_biased_neighbor_sampling.cpp b/cpp/tests/sampling/heterogeneous_biased_neighbor_sampling.cpp
new file mode 100644
index 00000000000..6ea00cf5104
--- /dev/null
+++ b/cpp/tests/sampling/heterogeneous_biased_neighbor_sampling.cpp
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/property_generator_utilities.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Heterogeneous_Biased_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};
+  int32_t batch_size{10};
+  int32_t num_edge_types{1};
+  bool flag_replacement{true};
+  // FIXME: Edge masking is currently not tested because it will
+  // require attaching two masks (edge_type_t, bool_t) which
+  // is not currently supported. Once a primitive to support
+  // heterogeneous sampling is added, maintaining two masks
+  // won't be necessary
+  // bool edge_masking{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_Heterogeneous_Biased_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Heterogeneous_Biased_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_Heterogeneous_Biased_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(std::tuple<Heterogeneous_Biased_Neighbor_Sampling_Usecase const&,
+                                   input_usecase_t const&> const& param)
+  {
+    using edge_type_t = int32_t;
+
+    auto [heterogeneous_biased_neighbor_sampling_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Construct graph");
+    }
+
+    auto [graph, edge_weights, renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+        handle, input_usecase, true, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto graph_view = graph.view();
+    auto edge_weight_view =
+      edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt;
+
+    constexpr float select_probability{0.05};
+
+    // FIXME:  Update the tests to initialize RngState and use it instead
+    //         of seed...
+    constexpr uint64_t seed{0};
+
+    raft::random::RngState rng_state(seed);
+
+    auto random_sources = cugraph::select_random_vertices(
+      handle,
+      graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      std::max(static_cast<size_t>(graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(graph_view.number_of_vertices()), size_t{1})),
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto batch_number = std::make_optional<rmm::device_uvector<int32_t>>(0, handle.get_stream());
+
+    batch_number =
+      cugraph::test::sequence(handle,
+                              random_sources.size(),
+                              heterogeneous_biased_neighbor_sampling_usecase.batch_size,
+                              int32_t{0});
+
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle.get_stream());
+
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle.get_stream());
+
+    std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank_mapping{std::nullopt};
+
+    // Generate the edge types
+
+    std::optional<cugraph::edge_property_t<decltype(graph_view), edge_type_t>> edge_types{
+      std::nullopt};
+
+    if (heterogeneous_biased_neighbor_sampling_usecase.num_edge_types > 1) {
+      edge_types = cugraph::test::generate<decltype(graph_view), edge_type_t>::edge_property(
+        handle, graph_view, heterogeneous_biased_neighbor_sampling_usecase.num_edge_types);
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Biased neighbor sampling");
+    }
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::heterogeneous_biased_neighbor_sample(
+        handle,
+        rng_state,
+        graph_view,
+        edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        edge_types
+          ? std::optional<cugraph::edge_property_view_t<edge_t, edge_type_t const*>>{(*edge_types)
+                                                                                       .view()}
+          : std::nullopt,
+        *edge_weight_view,
+        raft::device_span<vertex_t const>{random_sources_copy.data(), random_sources.size()},
+        batch_number ? std::make_optional(raft::device_span<int32_t const>{batch_number->data(),
+                                                                           batch_number->size()})
+                     : std::nullopt,
+        label_to_output_comm_rank_mapping,
+        raft::host_span<int32_t const>(
+          heterogeneous_biased_neighbor_sampling_usecase.fanout.data(),
+          heterogeneous_biased_neighbor_sampling_usecase.fanout.size()),
+        heterogeneous_biased_neighbor_sampling_usecase.num_edge_types,
+        cugraph::sampling_flags_t{cugraph::prior_sources_behavior_t{0},
+                                  true,   // return_hops
+                                  false,  // dedupe_sources
+                                  heterogeneous_biased_neighbor_sampling_usecase.flag_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (heterogeneous_biased_neighbor_sampling_usecase.check_correctness) {
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * src_out.size(), handle.get_stream());
+      raft::copy(vertices.data(), src_out.data(), src_out.size(), handle.get_stream());
+      raft::copy(
+        vertices.data() + src_out.size(), dst_out.data(), dst_out.size(), handle.get_stream());
+      vertices = cugraph::test::sort<vertex_t>(handle, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(handle, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle.get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle.get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle.get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        handle,
+        graph_view,
+        edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      ASSERT_TRUE(cugraph::test::validate_extracted_graph_is_subgraph(
+        handle, src_compare, dst_compare, wgt_compare, src_out, dst_out, wgt_out));
+
+      if (random_sources.size() < 100) {
+        // This validation is too expensive for large number of vertices
+        ASSERT_TRUE(cugraph::test::validate_sampling_depth(
+          handle,
+          std::move(src_out),
+          std::move(dst_out),
+          std::move(wgt_out),
+          std::move(random_sources),
+          heterogeneous_biased_neighbor_sampling_usecase.fanout.size()));
+      }
+    }
+  }
+};
+
+using Tests_Heterogeneous_Biased_Neighbor_Sampling_File =
+  Tests_Heterogeneous_Biased_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_Heterogeneous_Biased_Neighbor_Sampling_Rmat =
+  Tests_Heterogeneous_Biased_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_Heterogeneous_Biased_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Heterogeneous_Biased_Neighbor_Sampling_File, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Heterogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Heterogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_Heterogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,
+  Tests_Heterogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_Heterogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_Heterogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(
+      Heterogeneous_Biased_Neighbor_Sampling_Usecase{
+        {4, 10, 7, 8, 1, 9, 5, 12}, 1024, 4, false, false},
+      Heterogeneous_Biased_Neighbor_Sampling_Usecase{
+        {4, 10, 7, 8, 1, 9, 5, 12}, 1024, 4, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))));
+// #endif
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/heterogeneous_uniform_neighbor_sampling.cpp b/cpp/tests/sampling/heterogeneous_uniform_neighbor_sampling.cpp
new file mode 100644
index 00000000000..3b57aed4768
--- /dev/null
+++ b/cpp/tests/sampling/heterogeneous_uniform_neighbor_sampling.cpp
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/property_generator_utilities.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Heterogeneous_Uniform_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};
+  int32_t batch_size{10};
+  int32_t num_edge_types{1};
+  bool flag_replacement{true};
+  // FIXME: Edge masking is currently not tested because it will
+  // require attaching two masks (edge_type_t, bool_t) which
+  // is not currently supported. Once a primitive to support
+  // heterogeneous sampling is added, maintaining two masks
+  // won't be necessary
+  // bool edge_masking{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_Heterogeneous_Uniform_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Heterogeneous_Uniform_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(std::tuple<Heterogeneous_Uniform_Neighbor_Sampling_Usecase const&,
+                                   input_usecase_t const&> const& param)
+  {
+    using edge_type_t = int32_t;
+
+    auto [heterogeneous_uniform_neighbor_sampling_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Construct graph");
+    }
+
+    auto [graph, edge_weights, renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+        handle, input_usecase, true, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto graph_view = graph.view();
+    auto edge_weight_view =
+      edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt;
+
+    constexpr float select_probability{0.05};
+
+    constexpr uint64_t seed{0};
+
+    raft::random::RngState rng_state(seed);
+
+    auto random_sources = cugraph::select_random_vertices(
+      handle,
+      graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      std::max(static_cast<size_t>(graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(graph_view.number_of_vertices()), size_t{1})),
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto batch_number = std::make_optional<rmm::device_uvector<int32_t>>(0, handle.get_stream());
+
+    batch_number =
+      cugraph::test::sequence(handle,
+                              random_sources.size(),
+                              heterogeneous_uniform_neighbor_sampling_usecase.batch_size,
+                              int32_t{0});
+
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle.get_stream());
+
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle.get_stream());
+
+    std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank_mapping{std::nullopt};
+
+    // Generate the edge types
+
+    std::optional<cugraph::edge_property_t<decltype(graph_view), int32_t>> edge_types{std::nullopt};
+
+    if (heterogeneous_uniform_neighbor_sampling_usecase.num_edge_types > 1) {
+      edge_types = cugraph::test::generate<decltype(graph_view), int32_t>::edge_property(
+        handle, graph_view, heterogeneous_uniform_neighbor_sampling_usecase.num_edge_types);
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Uniform neighbor sampling");
+    }
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::heterogeneous_uniform_neighbor_sample(
+        handle,
+        rng_state,
+        graph_view,
+        edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        edge_types
+          ? std::optional<cugraph::edge_property_view_t<edge_t, edge_type_t const*>>{(*edge_types)
+                                                                                       .view()}
+          : std::nullopt,
+        raft::device_span<vertex_t const>{random_sources_copy.data(), random_sources.size()},
+        batch_number ? std::make_optional(raft::device_span<int32_t const>{batch_number->data(),
+                                                                           batch_number->size()})
+                     : std::nullopt,
+        label_to_output_comm_rank_mapping,
+        raft::host_span<int32_t const>(
+          heterogeneous_uniform_neighbor_sampling_usecase.fanout.data(),
+          heterogeneous_uniform_neighbor_sampling_usecase.fanout.size()),
+        heterogeneous_uniform_neighbor_sampling_usecase.num_edge_types,
+        cugraph::sampling_flags_t{
+          cugraph::prior_sources_behavior_t{0},
+          true,   // return_hops
+          false,  // dedupe_sources
+          heterogeneous_uniform_neighbor_sampling_usecase.flag_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (heterogeneous_uniform_neighbor_sampling_usecase.check_correctness) {
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * src_out.size(), handle.get_stream());
+      raft::copy(vertices.data(), src_out.data(), src_out.size(), handle.get_stream());
+      raft::copy(
+        vertices.data() + src_out.size(), dst_out.data(), dst_out.size(), handle.get_stream());
+      vertices = cugraph::test::sort<vertex_t>(handle, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(handle, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle.get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle.get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle.get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        handle,
+        graph_view,
+        edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      ASSERT_TRUE(cugraph::test::validate_extracted_graph_is_subgraph(
+        handle, src_compare, dst_compare, wgt_compare, src_out, dst_out, wgt_out));
+
+      if (random_sources.size() < 100) {
+        // This validation is too expensive for large number of vertices
+        ASSERT_TRUE(cugraph::test::validate_sampling_depth(
+          handle,
+          std::move(src_out),
+          std::move(dst_out),
+          std::move(wgt_out),
+          std::move(random_sources),
+          heterogeneous_uniform_neighbor_sampling_usecase.fanout.size()));
+      }
+    }
+  }
+};
+
+using Tests_Heterogeneous_Uniform_Neighbor_Sampling_File =
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_Heterogeneous_Uniform_Neighbor_Sampling_Rmat =
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_Heterogeneous_Uniform_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Heterogeneous_Uniform_Neighbor_Sampling_File, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Heterogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Heterogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(
+      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{
+        {4, 10, 7, 8, 1, 9, 5, 12}, 1024, 4, false, false},
+      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{
+        {4, 10, 7, 8, 1, 9, 5, 12}, 1024, 4, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))));
+// #endif
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/homogeneous_biased_neighbor_sampling.cpp b/cpp/tests/sampling/homogeneous_biased_neighbor_sampling.cpp
new file mode 100644
index 00000000000..14cf54e7d1c
--- /dev/null
+++ b/cpp/tests/sampling/homogeneous_biased_neighbor_sampling.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/property_generator_utilities.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Homogeneous_Biased_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};
+  int32_t batch_size{10};
+  bool flag_replacement{true};
+
+  bool edge_masking{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_Homogeneous_Biased_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Homogeneous_Biased_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_Homogeneous_Biased_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(std::tuple<Homogeneous_Biased_Neighbor_Sampling_Usecase const&,
+                                   input_usecase_t const&> const& param)
+  {
+    auto [homogeneous_biased_neighbor_sampling_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Construct graph");
+    }
+
+    auto [graph, edge_weights, renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+        handle, input_usecase, true, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto graph_view = graph.view();
+    auto edge_weight_view =
+      edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt;
+
+    std::optional<cugraph::edge_property_t<decltype(graph_view), bool>> edge_mask{std::nullopt};
+    if (homogeneous_biased_neighbor_sampling_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<decltype(graph_view), bool>::edge_property(handle, graph_view, 2);
+      graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    constexpr float select_probability{0.05};
+
+    constexpr uint64_t seed{0};
+
+    raft::random::RngState rng_state(seed);
+
+    auto random_sources = cugraph::select_random_vertices(
+      handle,
+      graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      std::max(static_cast<size_t>(graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(graph_view.number_of_vertices()), size_t{1})),
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto batch_number = std::make_optional<rmm::device_uvector<int32_t>>(0, handle.get_stream());
+
+    batch_number = cugraph::test::sequence(handle,
+                                           random_sources.size(),
+                                           homogeneous_biased_neighbor_sampling_usecase.batch_size,
+                                           int32_t{0});
+
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle.get_stream());
+
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle.get_stream());
+
+    std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank_mapping{std::nullopt};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Biased neighbor sampling");
+    }
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::homogeneous_biased_neighbor_sample(
+        handle,
+        rng_state,
+        graph_view,
+        edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
+        *edge_weight_view,
+        raft::device_span<vertex_t const>{random_sources_copy.data(), random_sources.size()},
+        batch_number ? std::make_optional(raft::device_span<int32_t const>{batch_number->data(),
+                                                                           batch_number->size()})
+                     : std::nullopt,
+        label_to_output_comm_rank_mapping,
+        raft::host_span<int32_t const>(homogeneous_biased_neighbor_sampling_usecase.fanout.data(),
+                                       homogeneous_biased_neighbor_sampling_usecase.fanout.size()),
+        cugraph::sampling_flags_t{cugraph::prior_sources_behavior_t{0},
+                                  true,   // return_hops
+                                  false,  // dedupe_sources
+                                  homogeneous_biased_neighbor_sampling_usecase.flag_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (homogeneous_biased_neighbor_sampling_usecase.check_correctness) {
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * src_out.size(), handle.get_stream());
+      raft::copy(vertices.data(), src_out.data(), src_out.size(), handle.get_stream());
+      raft::copy(
+        vertices.data() + src_out.size(), dst_out.data(), dst_out.size(), handle.get_stream());
+      vertices = cugraph::test::sort<vertex_t>(handle, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(handle, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle.get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle.get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle.get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        handle,
+        graph_view,
+        edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      ASSERT_TRUE(cugraph::test::validate_extracted_graph_is_subgraph(
+        handle, src_compare, dst_compare, wgt_compare, src_out, dst_out, wgt_out));
+
+      if (random_sources.size() < 100) {
+        // This validation is too expensive for large number of vertices
+        ASSERT_TRUE(cugraph::test::validate_sampling_depth(
+          handle,
+          std::move(src_out),
+          std::move(dst_out),
+          std::move(wgt_out),
+          std::move(random_sources),
+          homogeneous_biased_neighbor_sampling_usecase.fanout.size()));
+      }
+    }
+  }
+};
+
+using Tests_Homogeneous_Biased_Neighbor_Sampling_File =
+  Tests_Homogeneous_Biased_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_Homogeneous_Biased_Neighbor_Sampling_Rmat =
+  Tests_Homogeneous_Biased_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_Homogeneous_Biased_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Homogeneous_Biased_Neighbor_Sampling_File, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Homogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Homogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_Homogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,
+  Tests_Homogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_Homogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_Homogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 1024, false, false, false},
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 1024, false, true, false},
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 1024, true, false, false},
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 1024, true, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/homogeneous_uniform_neighbor_sampling.cpp b/cpp/tests/sampling/homogeneous_uniform_neighbor_sampling.cpp
new file mode 100644
index 00000000000..a257e424b3e
--- /dev/null
+++ b/cpp/tests/sampling/homogeneous_uniform_neighbor_sampling.cpp
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/property_generator_utilities.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Homogeneous_Uniform_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};
+  int32_t batch_size{10};
+  bool flag_replacement{true};
+
+  bool edge_masking{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_Homogeneous_Uniform_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Homogeneous_Uniform_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_Homogeneous_Uniform_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(std::tuple<Homogeneous_Uniform_Neighbor_Sampling_Usecase const&,
+                                   input_usecase_t const&> const& param)
+  {
+    auto [homogeneous_uniform_neighbor_sampling_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Construct graph");
+    }
+
+    auto [graph, edge_weights, renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+        handle, input_usecase, true, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto graph_view = graph.view();
+    auto edge_weight_view =
+      edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt;
+
+    std::optional<cugraph::edge_property_t<decltype(graph_view), bool>> edge_mask{std::nullopt};
+    if (homogeneous_uniform_neighbor_sampling_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<decltype(graph_view), bool>::edge_property(handle, graph_view, 2);
+      graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    // FIXME: Read a tuple of two edge mask and mask out if edge mask is set in either 1 (OR) and
+    // create a new one. No graph view can have two mask and perform OR in itself, and need to OR
+    // the mask manually by itself.
+
+    constexpr float select_probability{0.05};
+
+    // FIXME:  Update the tests to initialize RngState and use it instead
+    //         of seed...
+    constexpr uint64_t seed{0};
+
+    raft::random::RngState rng_state(seed);
+
+    auto random_sources = cugraph::select_random_vertices(
+      handle,
+      graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      std::max(static_cast<size_t>(graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(graph_view.number_of_vertices()), size_t{1})),
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto batch_number = std::make_optional<rmm::device_uvector<int32_t>>(0, handle.get_stream());
+
+    batch_number = cugraph::test::sequence(handle,
+                                           random_sources.size(),
+                                           homogeneous_uniform_neighbor_sampling_usecase.batch_size,
+                                           int32_t{0});
+
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle.get_stream());
+
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle.get_stream());
+
+    std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank_mapping{std::nullopt};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Uniform neighbor sampling");
+    }
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::homogeneous_uniform_neighbor_sample(
+        handle,
+        rng_state,
+        graph_view,
+        edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
+        raft::device_span<vertex_t const>{random_sources_copy.data(), random_sources.size()},
+        batch_number ? std::make_optional(raft::device_span<int32_t const>{batch_number->data(),
+                                                                           batch_number->size()})
+                     : std::nullopt,
+        label_to_output_comm_rank_mapping,
+        raft::host_span<int32_t const>(homogeneous_uniform_neighbor_sampling_usecase.fanout.data(),
+                                       homogeneous_uniform_neighbor_sampling_usecase.fanout.size()),
+        cugraph::sampling_flags_t{cugraph::prior_sources_behavior_t{0},
+                                  true,   // return_hops
+                                  false,  // dedupe_sources
+                                  homogeneous_uniform_neighbor_sampling_usecase.flag_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (homogeneous_uniform_neighbor_sampling_usecase.check_correctness) {
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * src_out.size(), handle.get_stream());
+      raft::copy(vertices.data(), src_out.data(), src_out.size(), handle.get_stream());
+      raft::copy(
+        vertices.data() + src_out.size(), dst_out.data(), dst_out.size(), handle.get_stream());
+      vertices = cugraph::test::sort<vertex_t>(handle, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(handle, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle.get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle.get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle.get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        handle,
+        graph_view,
+        edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      ASSERT_TRUE(cugraph::test::validate_extracted_graph_is_subgraph(
+        handle, src_compare, dst_compare, wgt_compare, src_out, dst_out, wgt_out));
+
+      if (random_sources.size() < 100) {
+        // This validation is too expensive for large number of vertices
+        ASSERT_TRUE(cugraph::test::validate_sampling_depth(
+          handle,
+          std::move(src_out),
+          std::move(dst_out),
+          std::move(wgt_out),
+          std::move(random_sources),
+          homogeneous_uniform_neighbor_sampling_usecase.fanout.size()));
+      }
+    }
+  }
+};
+
+using Tests_Homogeneous_Uniform_Neighbor_Sampling_File =
+  Tests_Homogeneous_Uniform_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_Homogeneous_Uniform_Neighbor_Sampling_Rmat =
+  Tests_Homogeneous_Uniform_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_Homogeneous_Uniform_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Homogeneous_Uniform_Neighbor_Sampling_File, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Homogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Homogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_Homogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,
+  Tests_Homogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_Homogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_Homogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 1024, false, false, false},
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 1024, false, true, false},
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 1024, true, false, false},
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 1024, true, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/mg_heterogeneous_biased_neighbor_sampling.cpp b/cpp/tests/sampling/mg_heterogeneous_biased_neighbor_sampling.cpp
new file mode 100644
index 00000000000..18d8491435d
--- /dev/null
+++ b/cpp/tests/sampling/mg_heterogeneous_biased_neighbor_sampling.cpp
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/device_comm_wrapper.hpp"
+#include "utilities/mg_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Heterogeneous_Biased_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};
+  int32_t batch_size{10};
+  int32_t num_edge_types{1};
+  bool with_replacement{true};
+  // FIXME: Edge masking is currently not tested because it will
+  // require attaching two masks (edge_type_t, bool_t) which
+  // is not currently supported. Once a primitive to support
+  // heterogeneous sampling is added, maintaining two masks
+  // won't be necessary
+  // bool edge_masking{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGHeterogeneous_Biased_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Heterogeneous_Biased_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(
+    std::tuple<Heterogeneous_Biased_Neighbor_Sampling_Usecase, input_usecase_t> const& param)
+  {
+    using edge_type_t = int32_t;
+
+    auto [heterogeneous_biased_neighbor_sampling_usecase, input_usecase] = param;
+
+    HighResTimer hr_timer{};
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG construct graph");
+    }
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_,
+        input_usecase,
+        true /* test_weighted */,
+        true /* renumber */,
+        false /* drop_self_loops */,
+        false /* drop_multi_edges */);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    //
+    // Test is designed like GNN sampling.  We'll select 5% of vertices to be included in sampling
+    // batches
+    //
+
+    constexpr float select_probability{0.05};
+
+    raft::random::RngState rng_state(handle_->get_comms().get_rank());
+
+    auto random_sources = cugraph::select_random_vertices(
+      *handle_,
+      mg_graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+
+      std::max(static_cast<size_t>(mg_graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(mg_graph_view.number_of_vertices()), size_t{1})),
+
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto seed_sizes = cugraph::host_scalar_allgather(
+      handle_->get_comms(), random_sources.size(), handle_->get_stream());
+    size_t num_seeds = std::reduce(seed_sizes.begin(), seed_sizes.end());
+    size_t num_batches =
+      (num_seeds + heterogeneous_biased_neighbor_sampling_usecase.batch_size - 1) /
+      heterogeneous_biased_neighbor_sampling_usecase.batch_size;
+
+    std::vector<size_t> seed_offsets(seed_sizes.size());
+    std::exclusive_scan(seed_sizes.begin(), seed_sizes.end(), seed_offsets.begin(), size_t{0});
+
+    auto batch_number = cugraph::test::modulo_sequence<int32_t>(
+      *handle_, random_sources.size(), num_batches, seed_offsets[handle_->get_comms().get_rank()]);
+
+    // Get unique batch_number -> label_list
+    rmm::device_uvector<int32_t> label_list(batch_number.size(), handle_->get_stream());
+
+    raft::copy(label_list.data(), batch_number.data(), batch_number.size(), handle_->get_stream());
+
+    label_list = cugraph::test::sort<int32_t>(*handle_, std::move(label_list));
+    label_list = cugraph::test::unique<int32_t>(*handle_, std::move(label_list));
+
+    auto num_unique_labels = label_list.size();
+
+    auto comm_ranks = cugraph::test::scalar_fill<int32_t>(
+      *handle_, num_unique_labels, int32_t{handle_->get_comms().get_rank()});
+
+    // perform allgatherv
+    comm_ranks = cugraph::test::device_allgatherv(*handle_, comm_ranks.data(), comm_ranks.size());
+
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle_->get_stream());
+
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle_->get_stream());
+
+    // Generate the edge types
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), edge_type_t>> edge_types{
+      std::nullopt};
+
+    if (heterogeneous_biased_neighbor_sampling_usecase.num_edge_types > 1) {
+      edge_types = cugraph::test::generate<decltype(mg_graph_view), edge_type_t>::edge_property(
+        *handle_, mg_graph_view, heterogeneous_biased_neighbor_sampling_usecase.num_edge_types);
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG biased_neighbor_sample");
+    }
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::heterogeneous_biased_neighbor_sample(
+        *handle_,
+        rng_state,
+        mg_graph_view,
+        mg_edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        edge_types
+          ? std::optional<cugraph::edge_property_view_t<edge_t, edge_type_t const*>>{(*edge_types)
+                                                                                       .view()}
+          : std::nullopt,
+        *mg_edge_weight_view,
+        raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()},
+        std::make_optional(
+          raft::device_span<int32_t const>{batch_number.data(), batch_number.size()}),
+        std::make_optional(raft::device_span<int32_t const>{comm_ranks.data(), comm_ranks.size()}),
+        raft::host_span<int32_t const>(
+          heterogeneous_biased_neighbor_sampling_usecase.fanout.data(),
+          heterogeneous_biased_neighbor_sampling_usecase.fanout.size()),
+        heterogeneous_biased_neighbor_sampling_usecase.num_edge_types,
+        cugraph::sampling_flags_t{cugraph::prior_sources_behavior_t{0},
+                                  true,   // return_hops
+                                  false,  // dedupe_sources
+                                  heterogeneous_biased_neighbor_sampling_usecase.with_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (heterogeneous_biased_neighbor_sampling_usecase.check_correctness) {
+      // Consolidate results on GPU 0
+      auto mg_start_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()});
+      auto mg_aggregate_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_out.data(), src_out.size()});
+      auto mg_aggregate_dst = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_out.data(), dst_out.size()});
+      auto mg_aggregate_wgt =
+        wgt_out ? std::make_optional(cugraph::test::device_gatherv(
+                    *handle_, raft::device_span<weight_t const>{wgt_out->data(), wgt_out->size()}))
+                : std::nullopt;
+
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(
+        vertices.data(), mg_aggregate_src.data(), mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(vertices.data() + mg_aggregate_src.size(),
+                 mg_aggregate_dst.data(),
+                 mg_aggregate_dst.size(),
+                 handle_->get_stream());
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      vertices = cugraph::detail::shuffle_int_vertices_to_local_gpu_by_vertex_partitioning(
+        *handle_, std::move(vertices), mg_graph_view.vertex_partition_range_lasts());
+
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle_->get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle_->get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle_->get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle_->get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      auto mg_aggregate_src_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_compare.data(), src_compare.size()});
+      auto mg_aggregate_dst_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_compare.data(), dst_compare.size()});
+      auto mg_aggregate_wgt_compare =
+        wgt_compare
+          ? std::make_optional(cugraph::test::device_gatherv(
+              *handle_,
+              raft::device_span<weight_t const>{wgt_compare->data(), wgt_compare->size()}))
+          : std::nullopt;
+
+      if (handle_->get_comms().get_rank() == 0) {
+        cugraph::test::validate_extracted_graph_is_subgraph(*handle_,
+                                                            mg_aggregate_src_compare,
+                                                            mg_aggregate_dst_compare,
+                                                            mg_aggregate_wgt_compare,
+                                                            mg_aggregate_src,
+                                                            mg_aggregate_dst,
+                                                            mg_aggregate_wgt);
+
+        if (random_sources.size() < 100) {
+          // This validation is too expensive for large number of vertices
+          if (mg_aggregate_src.size() > 0) {
+            cugraph::test::validate_sampling_depth(
+              *handle_,
+              std::move(mg_aggregate_src),
+              std::move(mg_aggregate_dst),
+              std::move(mg_aggregate_wgt),
+              std::move(mg_start_src),
+              heterogeneous_biased_neighbor_sampling_usecase.fanout.size());
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t>
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGHeterogeneous_Biased_Neighbor_Sampling_File =
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_MGHeterogeneous_Biased_Neighbor_Sampling_Rmat =
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGHeterogeneous_Biased_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGHeterogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGHeterogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(
+      // cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+      cugraph::test::Rmat_Usecase(5, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/mg_heterogeneous_uniform_neighbor_sampling.cpp b/cpp/tests/sampling/mg_heterogeneous_uniform_neighbor_sampling.cpp
new file mode 100644
index 00000000000..b6812b35170
--- /dev/null
+++ b/cpp/tests/sampling/mg_heterogeneous_uniform_neighbor_sampling.cpp
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/device_comm_wrapper.hpp"
+#include "utilities/mg_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Heterogeneous_Uniform_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};
+  int32_t batch_size{10};
+  int32_t num_edge_types{1};
+  bool with_replacement{true};
+  // FIXME: Edge masking is currently not tested because it will
+  // require attaching two masks (edge_type_t, bool_t) which
+  // is not currently supported. Once a primitive to support
+  // heterogeneous sampling is added, maintaining two masks
+  // won't be necessary
+  // bool edge_masking{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGHeterogeneous_Uniform_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Heterogeneous_Uniform_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(
+    std::tuple<Heterogeneous_Uniform_Neighbor_Sampling_Usecase, input_usecase_t> const& param)
+  {
+    using edge_type_t = int32_t;
+
+    auto [heterogeneous_uniform_neighbor_sampling_usecase, input_usecase] = param;
+
+    HighResTimer hr_timer{};
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG construct graph");
+    }
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_,
+        input_usecase,
+        true /* test_weighted */,
+        true /* renumber */,
+        false /* drop_self_loops */,
+        false /* drop_multi_edges */);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    //
+    // Test is designed like GNN sampling.  We'll select 5% of vertices to be included in sampling
+    // batches
+    //
+
+    constexpr float select_probability{0.05};
+
+    raft::random::RngState rng_state(handle_->get_comms().get_rank());
+
+    auto random_sources = cugraph::select_random_vertices(
+      *handle_,
+      mg_graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+
+      std::max(static_cast<size_t>(mg_graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(mg_graph_view.number_of_vertices()), size_t{1})),
+
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto seed_sizes = cugraph::host_scalar_allgather(
+      handle_->get_comms(), random_sources.size(), handle_->get_stream());
+    size_t num_seeds = std::reduce(seed_sizes.begin(), seed_sizes.end());
+    size_t num_batches =
+      (num_seeds + heterogeneous_uniform_neighbor_sampling_usecase.batch_size - 1) /
+      heterogeneous_uniform_neighbor_sampling_usecase.batch_size;
+
+    std::vector<size_t> seed_offsets(seed_sizes.size());
+    std::exclusive_scan(seed_sizes.begin(), seed_sizes.end(), seed_offsets.begin(), size_t{0});
+
+    auto batch_number = cugraph::test::modulo_sequence<int32_t>(
+      *handle_, random_sources.size(), num_batches, seed_offsets[handle_->get_comms().get_rank()]);
+
+    // Get unique batch_number -> label_list
+    rmm::device_uvector<int32_t> label_list(batch_number.size(), handle_->get_stream());
+
+    raft::copy(label_list.data(), batch_number.data(), batch_number.size(), handle_->get_stream());
+
+    label_list = cugraph::test::sort<int32_t>(*handle_, std::move(label_list));
+    label_list = cugraph::test::unique<int32_t>(*handle_, std::move(label_list));
+
+    auto num_unique_labels = label_list.size();
+
+    auto comm_ranks = cugraph::test::scalar_fill<int32_t>(
+      *handle_, num_unique_labels, int32_t{handle_->get_comms().get_rank()});
+
+    // perform allgatherv
+    comm_ranks = cugraph::test::device_allgatherv(*handle_, comm_ranks.data(), comm_ranks.size());
+
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle_->get_stream());
+
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle_->get_stream());
+
+    // Generate the edge types
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), edge_type_t>> edge_types{
+      std::nullopt};
+
+    if (heterogeneous_uniform_neighbor_sampling_usecase.num_edge_types > 1) {
+      edge_types = cugraph::test::generate<decltype(mg_graph_view), edge_type_t>::edge_property(
+        *handle_, mg_graph_view, heterogeneous_uniform_neighbor_sampling_usecase.num_edge_types);
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG uniform_neighbor_sample");
+    }
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::heterogeneous_uniform_neighbor_sample(
+        *handle_,
+        rng_state,
+        mg_graph_view,
+        mg_edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        edge_types
+          ? std::optional<cugraph::edge_property_view_t<edge_t, edge_type_t const*>>{(*edge_types)
+                                                                                       .view()}
+          : std::nullopt,
+        raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()},
+        std::make_optional(
+          raft::device_span<int32_t const>{batch_number.data(), batch_number.size()}),
+        std::make_optional(raft::device_span<int32_t const>{comm_ranks.data(), comm_ranks.size()}),
+        raft::host_span<int32_t const>(
+          heterogeneous_uniform_neighbor_sampling_usecase.fanout.data(),
+          heterogeneous_uniform_neighbor_sampling_usecase.fanout.size()),
+        heterogeneous_uniform_neighbor_sampling_usecase.num_edge_types,
+        cugraph::sampling_flags_t{
+          cugraph::prior_sources_behavior_t{0},
+          true,   // return_hops
+          false,  // dedupe_sources
+          heterogeneous_uniform_neighbor_sampling_usecase.with_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (heterogeneous_uniform_neighbor_sampling_usecase.check_correctness) {
+      // Consolidate results on GPU 0
+      auto mg_start_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()});
+      auto mg_aggregate_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_out.data(), src_out.size()});
+      auto mg_aggregate_dst = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_out.data(), dst_out.size()});
+      auto mg_aggregate_wgt =
+        wgt_out ? std::make_optional(cugraph::test::device_gatherv(
+                    *handle_, raft::device_span<weight_t const>{wgt_out->data(), wgt_out->size()}))
+                : std::nullopt;
+
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(
+        vertices.data(), mg_aggregate_src.data(), mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(vertices.data() + mg_aggregate_src.size(),
+                 mg_aggregate_dst.data(),
+                 mg_aggregate_dst.size(),
+                 handle_->get_stream());
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      vertices = cugraph::detail::shuffle_int_vertices_to_local_gpu_by_vertex_partitioning(
+        *handle_, std::move(vertices), mg_graph_view.vertex_partition_range_lasts());
+
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle_->get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle_->get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle_->get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle_->get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      auto mg_aggregate_src_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_compare.data(), src_compare.size()});
+      auto mg_aggregate_dst_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_compare.data(), dst_compare.size()});
+      auto mg_aggregate_wgt_compare =
+        wgt_compare
+          ? std::make_optional(cugraph::test::device_gatherv(
+              *handle_,
+              raft::device_span<weight_t const>{wgt_compare->data(), wgt_compare->size()}))
+          : std::nullopt;
+
+      if (handle_->get_comms().get_rank() == 0) {
+        cugraph::test::validate_extracted_graph_is_subgraph(*handle_,
+                                                            mg_aggregate_src_compare,
+                                                            mg_aggregate_dst_compare,
+                                                            mg_aggregate_wgt_compare,
+                                                            mg_aggregate_src,
+                                                            mg_aggregate_dst,
+                                                            mg_aggregate_wgt);
+
+        if (random_sources.size() < 100) {
+          // This validation is too expensive for large number of vertices
+          if (mg_aggregate_src.size() > 0) {
+            cugraph::test::validate_sampling_depth(
+              *handle_,
+              std::move(mg_aggregate_src),
+              std::move(mg_aggregate_dst),
+              std::move(mg_aggregate_wgt),
+              std::move(mg_start_src),
+              heterogeneous_uniform_neighbor_sampling_usecase.fanout.size());
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t>
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_File =
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_Rmat =
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(
+      // cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+      cugraph::test::Rmat_Usecase(5, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/mg_homogeneous_biased_neighbor_sampling.cpp b/cpp/tests/sampling/mg_homogeneous_biased_neighbor_sampling.cpp
new file mode 100644
index 00000000000..ce153fd3f75
--- /dev/null
+++ b/cpp/tests/sampling/mg_homogeneous_biased_neighbor_sampling.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/device_comm_wrapper.hpp"
+#include "utilities/mg_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Homogeneous_Biased_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};
+  int32_t batch_size{10};
+  bool with_replacement{true};
+
+  bool edge_masking{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGHomogeneous_Biased_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Homogeneous_Biased_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(
+    std::tuple<Homogeneous_Biased_Neighbor_Sampling_Usecase, input_usecase_t> const& param)
+  {
+    auto [homogeneous_biased_neighbor_sampling_usecase, input_usecase] = param;
+
+    HighResTimer hr_timer{};
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG construct graph");
+    }
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_,
+        input_usecase,
+        true /* test_weighted */,
+        true /* renumber */,
+        false /* drop_self_loops */,
+        false /* drop_multi_edges */);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (homogeneous_biased_neighbor_sampling_usecase.edge_masking) {
+      edge_mask = cugraph::test::generate<decltype(mg_graph_view), bool>::edge_property(
+        *handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    //
+    // Test is designed like GNN sampling.  We'll select 5% of vertices to be included in sampling
+    // batches
+    //
+
+    constexpr float select_probability{0.05};
+
+    raft::random::RngState rng_state(handle_->get_comms().get_rank());
+
+    auto random_sources = cugraph::select_random_vertices(
+      *handle_,
+      mg_graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+
+      std::max(static_cast<size_t>(mg_graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(mg_graph_view.number_of_vertices()), size_t{1})),
+
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto seed_sizes = cugraph::host_scalar_allgather(
+      handle_->get_comms(), random_sources.size(), handle_->get_stream());
+    size_t num_seeds   = std::reduce(seed_sizes.begin(), seed_sizes.end());
+    size_t num_batches = (num_seeds + homogeneous_biased_neighbor_sampling_usecase.batch_size - 1) /
+                         homogeneous_biased_neighbor_sampling_usecase.batch_size;
+
+    std::vector<size_t> seed_offsets(seed_sizes.size());
+    std::exclusive_scan(seed_sizes.begin(), seed_sizes.end(), seed_offsets.begin(), size_t{0});
+
+    auto batch_number = cugraph::test::modulo_sequence<int32_t>(
+      *handle_, random_sources.size(), num_batches, seed_offsets[handle_->get_comms().get_rank()]);
+
+    // Get unique batch_number -> label_list
+    rmm::device_uvector<int32_t> label_list(batch_number.size(), handle_->get_stream());
+
+    raft::copy(label_list.data(), batch_number.data(), batch_number.size(), handle_->get_stream());
+
+    label_list = cugraph::test::sort<int32_t>(*handle_, std::move(label_list));
+    label_list = cugraph::test::unique<int32_t>(*handle_, std::move(label_list));
+
+    auto num_unique_labels = label_list.size();
+
+    auto comm_ranks = cugraph::test::scalar_fill<int32_t>(
+      *handle_, num_unique_labels, int32_t{handle_->get_comms().get_rank()});
+
+    // perform allgatherv
+    comm_ranks = cugraph::test::device_allgatherv(*handle_, comm_ranks.data(), comm_ranks.size());
+
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle_->get_stream());
+
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle_->get_stream());
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG homogeneous_biased_neighbor_sample");
+    }
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::homogeneous_biased_neighbor_sample(
+        *handle_,
+        rng_state,
+        mg_graph_view,
+        mg_edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
+        *mg_edge_weight_view,
+        raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()},
+        std::make_optional(
+          raft::device_span<int32_t const>{batch_number.data(), batch_number.size()}),
+        std::make_optional(raft::device_span<int32_t const>{comm_ranks.data(), comm_ranks.size()}),
+        raft::host_span<int32_t const>(homogeneous_biased_neighbor_sampling_usecase.fanout.data(),
+                                       homogeneous_biased_neighbor_sampling_usecase.fanout.size()),
+
+        cugraph::sampling_flags_t{cugraph::prior_sources_behavior_t{0},
+                                  true,   // return_hops
+                                  false,  // dedupe_sources
+                                  homogeneous_biased_neighbor_sampling_usecase.with_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (homogeneous_biased_neighbor_sampling_usecase.check_correctness) {
+      // Consolidate results on GPU 0
+      auto mg_start_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()});
+      auto mg_aggregate_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_out.data(), src_out.size()});
+      auto mg_aggregate_dst = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_out.data(), dst_out.size()});
+      auto mg_aggregate_wgt =
+        wgt_out ? std::make_optional(cugraph::test::device_gatherv(
+                    *handle_, raft::device_span<weight_t const>{wgt_out->data(), wgt_out->size()}))
+                : std::nullopt;
+
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(
+        vertices.data(), mg_aggregate_src.data(), mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(vertices.data() + mg_aggregate_src.size(),
+                 mg_aggregate_dst.data(),
+                 mg_aggregate_dst.size(),
+                 handle_->get_stream());
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      vertices = cugraph::detail::shuffle_int_vertices_to_local_gpu_by_vertex_partitioning(
+        *handle_, std::move(vertices), mg_graph_view.vertex_partition_range_lasts());
+
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle_->get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle_->get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle_->get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle_->get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      auto mg_aggregate_src_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_compare.data(), src_compare.size()});
+      auto mg_aggregate_dst_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_compare.data(), dst_compare.size()});
+      auto mg_aggregate_wgt_compare =
+        wgt_compare
+          ? std::make_optional(cugraph::test::device_gatherv(
+              *handle_,
+              raft::device_span<weight_t const>{wgt_compare->data(), wgt_compare->size()}))
+          : std::nullopt;
+
+      if (handle_->get_comms().get_rank() == 0) {
+        cugraph::test::validate_extracted_graph_is_subgraph(*handle_,
+                                                            mg_aggregate_src_compare,
+                                                            mg_aggregate_dst_compare,
+                                                            mg_aggregate_wgt_compare,
+                                                            mg_aggregate_src,
+                                                            mg_aggregate_dst,
+                                                            mg_aggregate_wgt);
+
+        if (random_sources.size() < 100) {
+          // This validation is too expensive for large number of vertices
+          if (mg_aggregate_src.size() > 0) {
+            cugraph::test::validate_sampling_depth(
+              *handle_,
+              std::move(mg_aggregate_src),
+              std::move(mg_aggregate_dst),
+              std::move(mg_aggregate_wgt),
+              std::move(mg_start_src),
+              homogeneous_biased_neighbor_sampling_usecase.fanout.size());
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t>
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGHomogeneous_Biased_Neighbor_Sampling_File =
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_MGHomogeneous_Biased_Neighbor_Sampling_Rmat =
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGHomogeneous_Biased_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGHomogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGHomogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(
+      // cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+      cugraph::test::Rmat_Usecase(5, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false, false},
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true, false},
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false, false},
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/mg_homogeneous_uniform_neighbor_sampling.cpp b/cpp/tests/sampling/mg_homogeneous_uniform_neighbor_sampling.cpp
new file mode 100644
index 00000000000..88f2b8e28c8
--- /dev/null
+++ b/cpp/tests/sampling/mg_homogeneous_uniform_neighbor_sampling.cpp
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/device_comm_wrapper.hpp"
+#include "utilities/mg_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Homogeneous_Uniform_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};
+  int32_t batch_size{10};
+  bool with_replacement{true};
+
+  bool edge_masking{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGHomogeneous_Uniform_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Homogeneous_Uniform_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(
+    std::tuple<Homogeneous_Uniform_Neighbor_Sampling_Usecase, input_usecase_t> const& param)
+  {
+    auto [homogeneous_uniform_neighbor_sampling_usecase, input_usecase] = param;
+
+    HighResTimer hr_timer{};
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG construct graph");
+    }
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_,
+        input_usecase,
+        true /* test_weighted */,
+        true /* renumber */,
+        false /* drop_self_loops */,
+        false /* drop_multi_edges */);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (homogeneous_uniform_neighbor_sampling_usecase.edge_masking) {
+      edge_mask = cugraph::test::generate<decltype(mg_graph_view), bool>::edge_property(
+        *handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    //
+    // Test is designed like GNN sampling.  We'll select 5% of vertices to be included in sampling
+    // batches
+    //
+
+    constexpr float select_probability{0.05};
+
+    raft::random::RngState rng_state(handle_->get_comms().get_rank());
+
+    auto random_sources = cugraph::select_random_vertices(
+      *handle_,
+      mg_graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+
+      std::max(static_cast<size_t>(mg_graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(mg_graph_view.number_of_vertices()), size_t{1})),
+
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto seed_sizes = cugraph::host_scalar_allgather(
+      handle_->get_comms(), random_sources.size(), handle_->get_stream());
+    size_t num_seeds = std::reduce(seed_sizes.begin(), seed_sizes.end());
+    size_t num_batches =
+      (num_seeds + homogeneous_uniform_neighbor_sampling_usecase.batch_size - 1) /
+      homogeneous_uniform_neighbor_sampling_usecase.batch_size;
+
+    std::vector<size_t> seed_offsets(seed_sizes.size());
+    std::exclusive_scan(seed_sizes.begin(), seed_sizes.end(), seed_offsets.begin(), size_t{0});
+
+    auto batch_number = cugraph::test::modulo_sequence<int32_t>(
+      *handle_, random_sources.size(), num_batches, seed_offsets[handle_->get_comms().get_rank()]);
+
+    // Get unique batch_number -> label_list
+    rmm::device_uvector<int32_t> label_list(batch_number.size(), handle_->get_stream());
+
+    raft::copy(label_list.data(), batch_number.data(), batch_number.size(), handle_->get_stream());
+
+    label_list = cugraph::test::sort<int32_t>(*handle_, std::move(label_list));
+    label_list = cugraph::test::unique<int32_t>(*handle_, std::move(label_list));
+
+    auto num_unique_labels = label_list.size();
+
+    auto comm_ranks = cugraph::test::scalar_fill<int32_t>(
+      *handle_, num_unique_labels, int32_t{handle_->get_comms().get_rank()});
+
+    // perform allgatherv
+    comm_ranks = cugraph::test::device_allgatherv(*handle_, comm_ranks.data(), comm_ranks.size());
+
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle_->get_stream());
+
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle_->get_stream());
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG uniform_neighbor_sample");
+    }
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::homogeneous_uniform_neighbor_sample(
+        *handle_,
+        rng_state,
+        mg_graph_view,
+        mg_edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
+        raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()},
+        std::make_optional(
+          raft::device_span<int32_t const>{batch_number.data(), batch_number.size()}),
+        std::make_optional(raft::device_span<int32_t const>{comm_ranks.data(), comm_ranks.size()}),
+        raft::host_span<int32_t const>(homogeneous_uniform_neighbor_sampling_usecase.fanout.data(),
+                                       homogeneous_uniform_neighbor_sampling_usecase.fanout.size()),
+
+        cugraph::sampling_flags_t{cugraph::prior_sources_behavior_t{0},
+                                  true,   // return_hops
+                                  false,  // dedupe_sources
+                                  homogeneous_uniform_neighbor_sampling_usecase.with_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (homogeneous_uniform_neighbor_sampling_usecase.check_correctness) {
+      // Consolidate results on GPU 0
+      auto mg_start_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()});
+      auto mg_aggregate_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_out.data(), src_out.size()});
+      auto mg_aggregate_dst = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_out.data(), dst_out.size()});
+      auto mg_aggregate_wgt =
+        wgt_out ? std::make_optional(cugraph::test::device_gatherv(
+                    *handle_, raft::device_span<weight_t const>{wgt_out->data(), wgt_out->size()}))
+                : std::nullopt;
+
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(
+        vertices.data(), mg_aggregate_src.data(), mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(vertices.data() + mg_aggregate_src.size(),
+                 mg_aggregate_dst.data(),
+                 mg_aggregate_dst.size(),
+                 handle_->get_stream());
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      vertices = cugraph::detail::shuffle_int_vertices_to_local_gpu_by_vertex_partitioning(
+        *handle_, std::move(vertices), mg_graph_view.vertex_partition_range_lasts());
+
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle_->get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle_->get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle_->get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle_->get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      auto mg_aggregate_src_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_compare.data(), src_compare.size()});
+      auto mg_aggregate_dst_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_compare.data(), dst_compare.size()});
+      auto mg_aggregate_wgt_compare =
+        wgt_compare
+          ? std::make_optional(cugraph::test::device_gatherv(
+              *handle_,
+              raft::device_span<weight_t const>{wgt_compare->data(), wgt_compare->size()}))
+          : std::nullopt;
+
+      if (handle_->get_comms().get_rank() == 0) {
+        cugraph::test::validate_extracted_graph_is_subgraph(*handle_,
+                                                            mg_aggregate_src_compare,
+                                                            mg_aggregate_dst_compare,
+                                                            mg_aggregate_wgt_compare,
+                                                            mg_aggregate_src,
+                                                            mg_aggregate_dst,
+                                                            mg_aggregate_wgt);
+
+        if (random_sources.size() < 100) {
+          // This validation is too expensive for large number of vertices
+          if (mg_aggregate_src.size() > 0) {
+            cugraph::test::validate_sampling_depth(
+              *handle_,
+              std::move(mg_aggregate_src),
+              std::move(mg_aggregate_dst),
+              std::move(mg_aggregate_wgt),
+              std::move(mg_start_src),
+              homogeneous_uniform_neighbor_sampling_usecase.fanout.size());
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t>
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGHomogeneous_Uniform_Neighbor_Sampling_File =
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_MGHomogeneous_Uniform_Neighbor_Sampling_Rmat =
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGHomogeneous_Uniform_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGHomogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGHomogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(
+      // cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+      cugraph::test::Rmat_Usecase(5, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false, false},
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true, false},
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false, false},
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu
index ef1c4f831eb..095cd15872b 100644
--- a/cpp/tests/utilities/thrust_wrapper.cu
+++ b/cpp/tests/utilities/thrust_wrapper.cu
@@ -41,6 +41,7 @@ namespace test {
 
 template <typename value_t>
 cugraph::dataframe_buffer_type_t<value_t> sort(
+
   raft::handle_t const& handle, cugraph::dataframe_buffer_type_t<value_t> const& values)
 {
   auto sorted_values = cugraph::allocate_dataframe_buffer<value_t>(
@@ -403,6 +404,25 @@ template rmm::device_uvector<int64_t> sequence(raft::handle_t const& handle,
                                                size_t repeat_count,
                                                int64_t init);
 
+template <typename value_t>
+cugraph::dataframe_buffer_type_t<value_t> scalar_fill(raft::handle_t const& handle,
+                                                      size_t length,
+                                                      value_t value)
+{
+  auto values = cugraph::allocate_dataframe_buffer<value_t>(length, handle.get_stream());
+
+  thrust::tabulate(
+    handle.get_thrust_policy(), values.begin(), values.end(), [value] __device__(size_t i) {
+      return value;
+    });
+
+  return values;
+}
+
+template rmm::device_uvector<int32_t> scalar_fill(raft::handle_t const& handle,
+                                                  size_t length,
+                                                  int32_t value);
+
 template <typename value_t>
 cugraph::dataframe_buffer_type_t<value_t> modulo_sequence(raft::handle_t const& handle,
                                                           size_t length,
@@ -546,5 +566,35 @@ template void expand_hypersparse_offsets(raft::handle_t const& handle,
                                          raft::device_span<int64_t> indices,
                                          size_t base_offset);
 
+template <typename vertex_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> remove_self_loops(
+  raft::handle_t const& handle,
+  rmm::device_uvector<vertex_t>&& v1,
+  rmm::device_uvector<vertex_t>&& v2)
+{
+  auto new_size = thrust::distance(
+    thrust::make_zip_iterator(v1.begin(), v2.begin()),
+    thrust::remove_if(
+      handle.get_thrust_policy(),
+      thrust::make_zip_iterator(v1.begin(), v2.begin()),
+      thrust::make_zip_iterator(v1.end(), v2.end()),
+      [] __device__(auto tuple) { return thrust::get<0>(tuple) == thrust::get<1>(tuple); }));
+
+  v1.resize(new_size, handle.get_stream());
+  v2.resize(new_size, handle.get_stream());
+
+  return std::make_tuple(std::move(v1), std::move(v2));
+}
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>> remove_self_loops(
+  raft::handle_t const& handle,
+  rmm::device_uvector<int32_t>&& v1,
+  rmm::device_uvector<int32_t>&& v2);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>> remove_self_loops(
+  raft::handle_t const& handle,
+  rmm::device_uvector<int64_t>&& v1,
+  rmm::device_uvector<int64_t>&& v2);
+
 }  // namespace test
 }  // namespace cugraph
diff --git a/cpp/tests/utilities/thrust_wrapper.hpp b/cpp/tests/utilities/thrust_wrapper.hpp
index afdff33d80a..b6c8052e6b5 100644
--- a/cpp/tests/utilities/thrust_wrapper.hpp
+++ b/cpp/tests/utilities/thrust_wrapper.hpp
@@ -73,6 +73,11 @@ cugraph::dataframe_buffer_type_t<value_t> sequence(raft::handle_t const& handle,
                                                    size_t repeat_count,
                                                    value_t init);
 
+template <typename value_t>
+cugraph::dataframe_buffer_type_t<value_t> scalar_fill(raft::handle_t const& handle,
+                                                      size_t length,
+                                                      value_t value);
+
 // return (init + i) % modulo, where i = [0, length)
 template <typename value_t>
 cugraph::dataframe_buffer_type_t<value_t> modulo_sequence(raft::handle_t const& handle,
@@ -107,5 +112,11 @@ void expand_hypersparse_offsets(raft::handle_t const& handle,
                                 raft::device_span<idx_t> indices,
                                 offset_t base_offset);
 
+template <typename vertex_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> remove_self_loops(
+  raft::handle_t const& handle,
+  rmm::device_uvector<vertex_t>&& v1,
+  rmm::device_uvector<vertex_t>&& v2);
+
 }  // namespace test
 }  // namespace cugraph
diff --git a/dependencies.yaml b/dependencies.yaml
index 36c7a01250e..3b53aecec01 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -26,14 +26,10 @@ files:
       - depends_on_cupy
       - depends_on_pytorch
       - python_run_cugraph
-      - python_run_nx_cugraph
-      - python_run_cugraph_dgl
-      - python_run_cugraph_pyg
       - test_notebook
       - test_python_common
       - test_python_cugraph
       - test_python_pylibcugraph
-      - test_python_nx_cugraph
   checks:
     output: none
     includes:
@@ -150,110 +146,6 @@ files:
       - depends_on_cudf
       - test_python_common
       - test_python_pylibcugraph
-  py_build_nx_cugraph:
-    output: pyproject
-    pyproject_dir: python/nx-cugraph
-    extras:
-      table: build-system
-    includes:
-      - python_build_rapids
-      - python_build_wheel
-  py_run_nx_cugraph:
-    output: pyproject
-    pyproject_dir: python/nx-cugraph
-    extras:
-      table: project
-    includes:
-      - depends_on_pylibcugraph
-      - depends_on_cupy
-      - python_run_nx_cugraph
-  py_test_nx_cugraph:
-    output: pyproject
-    pyproject_dir: python/nx-cugraph
-    extras:
-      table: project.optional-dependencies
-      key: test
-    includes:
-      - test_python_common
-      - test_python_nx_cugraph
-  py_build_cugraph_dgl:
-    output: pyproject
-    pyproject_dir: python/cugraph-dgl
-    extras:
-      table: build-system
-    includes:
-      - python_build_rapids
-      - python_build_wheel
-  py_run_cugraph_dgl:
-    output: pyproject
-    pyproject_dir: python/cugraph-dgl
-    extras:
-      table: project
-    includes:
-      - python_run_cugraph_dgl
-        # Deprecate pylibcugraphops
-      - depends_on_pylibcugraphops
-  py_test_cugraph_dgl:
-    output: pyproject
-    pyproject_dir: python/cugraph-dgl
-    extras:
-      table: project.optional-dependencies
-      key: test
-    includes:
-      - test_python_common
-      - depends_on_pylibwholegraph
-      - depends_on_pytorch
-  py_build_cugraph_pyg:
-    output: pyproject
-    pyproject_dir: python/cugraph-pyg
-    extras:
-      table: build-system
-    includes:
-      - python_build_rapids
-      - python_build_wheel
-  py_run_cugraph_pyg:
-    output: pyproject
-    pyproject_dir: python/cugraph-pyg
-    extras:
-      table: project
-    includes:
-      - python_run_cugraph_pyg
-        # Deprecate pylibcugraphops
-      - depends_on_pylibcugraphops
-  py_test_cugraph_pyg:
-    output: pyproject
-    pyproject_dir: python/cugraph-pyg
-    extras:
-      table: project.optional-dependencies
-      key: test
-    includes:
-      - test_python_common
-      - depends_on_pylibwholegraph
-      - depends_on_pytorch
-  py_build_cugraph_equivariant:
-    output: pyproject
-    pyproject_dir: python/cugraph-equivariant
-    extras:
-      table: build-system
-    includes:
-      - python_build_rapids
-      - python_build_wheel
-  py_run_cugraph_equivariant:
-    output: pyproject
-    pyproject_dir: python/cugraph-equivariant
-    extras:
-      table: project
-    includes:
-        # Deprecate pylibcugraphops
-      - depends_on_pylibcugraphops
-  py_test_cugraph_equivariant:
-    output: pyproject
-    pyproject_dir: python/cugraph-equivariant
-    extras:
-      table: project.optional-dependencies
-      key: test
-    includes:
-      - test_python_common
   py_build_cugraph_service_client:
     output: pyproject
     pyproject_dir: python/cugraph-service/client
@@ -297,33 +189,10 @@ files:
     includes:
       - test_python_common
       - test_python_cugraph
-  cugraph_dgl_dev:
-    matrix:
-      cuda: ["11.8"]
-    output: conda
-    conda_dir: python/cugraph-dgl/conda
-    includes:
-      - checks
-        # Deprecate pylibcugraphops
-      - depends_on_pylibcugraphops
-      - cugraph_dgl_dev
-      - test_python_common
-  cugraph_pyg_dev:
-    matrix:
-      cuda: ["11.8"]
-    output: conda
-    conda_dir: python/cugraph-pyg/conda
-    includes:
-      - checks
-        # Deprecate pylibcugraphops
-      - depends_on_pylibcugraphops
-      - cugraph_pyg_dev
-      - test_python_common
 channels:
   - rapidsai
   - rapidsai-nightly
   - dask/label/dev
-  - dglteam/label/th23_cu118
   - conda-forge
   - nvidia
 dependencies:
@@ -553,56 +422,6 @@ dependencies:
           - matrix:
             packages:
               - *ucx_py_unsuffixed
-  python_run_nx_cugraph:
-    common:
-      - output_types: [conda, pyproject]
-        packages:
-          - networkx>=3.0
-          - *numpy
-  python_run_cugraph_dgl:
-    common:
-      - output_types: [conda, pyproject]
-        packages:
-          - *numba
-          - *numpy
-    specific:
-      - output_types: [pyproject]
-        matrices:
-          - matrix:
-              cuda: "11.*"
-              cuda_suffixed: "true"
-            packages:
-              - &cugraph_cu11 cugraph-cu11==25.2.*,>=0.0.0a0
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "true"
-            packages:
-              - &cugraph_cu12 cugraph-cu12==25.2.*,>=0.0.0a0
-          - matrix:
-            packages:
-              - &cugraph_unsuffixed cugraph==25.2.*,>=0.0.0a0
-  python_run_cugraph_pyg:
-    common:
-      - output_types: [conda, pyproject]
-        packages:
-          - *numba
-          - *numpy
-    specific:
-      - output_types: [pyproject]
-        matrices:
-          - matrix:
-              cuda: "11.*"
-              cuda_suffixed: "true"
-            packages:
-              - *cugraph_cu11
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "true"
-            packages:
-              - *cugraph_cu12
-          - matrix:
-            packages:
-              - *cugraph_unsuffixed
   python_run_cugraph_service_client:
     common:
       - output_types: [conda, pyproject]
@@ -627,19 +446,19 @@ dependencies:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - *cugraph_cu11
+              - &cugraph_cu11 cugraph-cu11==25.2.*,>=0.0.0a0
               - cugraph-service-client-cu11==25.2.*,>=0.0.0a0
               - *ucx_py_cu11
           - matrix:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - *cugraph_cu12
+              - &cugraph_cu12 cugraph-cu12==25.2.*,>=0.0.0a0
               - cugraph-service-client-cu12==25.2.*,>=0.0.0a0
               - *ucx_py_cu12
           - matrix:
             packages:
-              - *cugraph_unsuffixed
+              - &cugraph_unsuffixed cugraph==25.2.*,>=0.0.0a0
               - cugraph-service-client==25.2.*,>=0.0.0a0
               - *ucx_py_unsuffixed
   test_cpp:
@@ -683,38 +502,12 @@ dependencies:
       - output_types: [conda, pyproject]
         packages:
           - *numpy
-  test_python_nx_cugraph:
-    common:
-      - output_types: [conda, pyproject]
-        packages:
-            # not needed by nx-cugraph tests, but is required for running networkx tests
-          - pytest-mpl
-  cugraph_dgl_dev:
-    common:
-      - output_types: [conda]
-        packages:
-          - *cugraph_unsuffixed
-          # ceiling could be removed when this is fixed:
-          # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/254
-          - &pytorch_conda pytorch>=2.3,<2.4.0a0
-          - pytorch-cuda==11.8
-          - &tensordict tensordict>=0.1.2
-          - dgl>=2.4.0.cu*
-  cugraph_pyg_dev:
-    common:
-      - output_types: [conda]
-        packages:
-          - *cugraph_unsuffixed
-          - *pytorch_conda
-          - pytorch-cuda==11.8
-          - *tensordict
-          - pytorch_geometric>=2.5,<2.6
 
   depends_on_pytorch:
     common:
       - output_types: [conda]
         packages:
-          - *pytorch_conda
+          - &pytorch_conda pytorch>=2.3
           - torchdata
           - pydantic
           - ogb
@@ -734,8 +527,8 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - &pytorch_pip torch>=2.3,<2.4.0a0
-              - *tensordict
+              - &pytorch_pip torch>=2.3
+              - &tensordict tensordict>=0.1.2
           - matrix: {cuda: "11.*"}
             packages:
               - *pytorch_pip
diff --git a/docs/cugraph/source/installation/getting_cugraph.md b/docs/cugraph/source/installation/getting_cugraph.md
index 01bc9e379c9..0c553acf964 100644
--- a/docs/cugraph/source/installation/getting_cugraph.md
+++ b/docs/cugraph/source/installation/getting_cugraph.md
@@ -31,7 +31,6 @@ cuGraph Conda packages
  * cugraph-service-server
  * cugraph-dgl
  * cugraph-pyg
- * cugraph-equivariant
  * nx-cugraph
 
 Replace the package name in the example below to the one you want to install.
@@ -61,7 +60,6 @@ Replace `-cu12` with `-cu11` for packages supporting CUDA 11.
 Also available:
  * cugraph-dgl-cu12
  * cugraph-pyg-cu12
- * cugraph-equivariant-cu12
  * nx-cugraph-cu12
 
 <br>
diff --git a/docs/cugraph/source/nx_cugraph/benchmarks.md b/docs/cugraph/source/nx_cugraph/benchmarks.md
index 45085c133a9..9e0718159fa 100644
--- a/docs/cugraph/source/nx_cugraph/benchmarks.md
+++ b/docs/cugraph/source/nx_cugraph/benchmarks.md
@@ -9,7 +9,7 @@ We ran several commonly used graph algorithms on both `networkx` and `nx-cugraph
 ![bench-image](../_static/bc_benchmark.png)
 
 <figcaption style="text-align: center;">Results from running this <a
-href="https://github.com/rapidsai/cugraph/blob/HEAD/benchmarks/nx-cugraph/pytest-based/bench_algos.py">Benchmark</a><span
+href="https://github.com/rapidsai/nx-cugraph/blob/HEAD/benchmarks/nx-cugraph/pytest-based/bench_algos.py">Benchmark</a><span
 class="title-ref"></span></figcaption>
 </figure>
 
@@ -23,4 +23,4 @@ Below are the steps to reproduce the results on your own.
 
 4. Install the latest `nx-cugraph` by following the [Installation Guide](installation.md)
 
-5. Follow the instructions written in the README [here](https://github.com/rapidsai/cugraph/blob/HEAD/benchmarks/nx-cugraph/pytest-based)
+5. Follow the instructions written in the README [here](https://github.com/rapidsai/nx-cugraph/blob/HEAD/benchmarks/nx-cugraph/pytest-based/README.md)
diff --git a/docs/cugraph/source/nx_cugraph/how-it-works.md b/docs/cugraph/source/nx_cugraph/how-it-works.md
index 5696688d1b5..88788f3c0cc 100644
--- a/docs/cugraph/source/nx_cugraph/how-it-works.md
+++ b/docs/cugraph/source/nx_cugraph/how-it-works.md
@@ -110,4 +110,4 @@ This run will be much faster, typically around 5 seconds depending on your GPU.
 
 ---
 
-The latest list of algorithms supported by `nx-cugraph` can be found in [GitHub](https://github.com/rapidsai/cugraph/blob/HEAD/python/nx-cugraph/README.md#algorithms), or in the [Supported Algorithms Section](supported-algorithms.md).
+The latest list of algorithms supported by `nx-cugraph` can be found in [GitHub](https://github.com/rapidsai/nx-cugraph/blob/HEAD/README.md#supported-algorithms), or in the [Supported Algorithms Section](supported-algorithms.md).
diff --git a/docs/cugraph/source/nx_cugraph/index.rst b/docs/cugraph/source/nx_cugraph/index.rst
index 730958a5b73..50565c805a9 100644
--- a/docs/cugraph/source/nx_cugraph/index.rst
+++ b/docs/cugraph/source/nx_cugraph/index.rst
@@ -3,7 +3,7 @@ nx-cugraph
 
 ``nx-cugraph`` is a NetworkX backend that provides **GPU acceleration** to many popular NetworkX algorithms.
 
-By simply `installing and enabling nx-cugraph <https://github.com/rapidsai/cugraph/blob/HEAD/python/nx-cugraph/README.md#install>`_, users can see significant speedup on workflows where performance is hindered by the default NetworkX implementation.
+By simply `installing and enabling nx-cugraph <https://docs.rapids.ai/api/cugraph/stable/nx_cugraph/installation/>`_, users can see significant speedup on workflows where performance is hindered by the default NetworkX implementation.
 
 Users can have GPU-based, large-scale performance **without** changing their familiar and easy-to-use NetworkX code.
 
diff --git a/docs/cugraph/source/nx_cugraph/supported-algorithms.rst b/docs/cugraph/source/nx_cugraph/supported-algorithms.rst
index 8f57c02b240..ae32bc330fe 100644
--- a/docs/cugraph/source/nx_cugraph/supported-algorithms.rst
+++ b/docs/cugraph/source/nx_cugraph/supported-algorithms.rst
@@ -352,4 +352,4 @@ Generators
 
 
 To request nx-cugraph backend support for a NetworkX API that is not listed
-above, visit the `cuGraph GitHub repo <https://github.com/rapidsai/cugraph>`_.
+above, visit the `nx-cugraph GitHub repo <https://github.com/rapidsai/nx-cugraph>`_.
diff --git a/notebooks/cugraph_benchmarks/nx_cugraph_benchmark.ipynb b/notebooks/cugraph_benchmarks/nx_cugraph_benchmark.ipynb
deleted file mode 100644
index bc57947f200..00000000000
--- a/notebooks/cugraph_benchmarks/nx_cugraph_benchmark.ipynb
+++ /dev/null
@@ -1,365 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Benchmarking Performance of NetworkX without and with the RAPIDS GPU-based nx-cugraph backend\n",
-    "\n",
-    "This notebook collects the run-times without and with the nx-cugraph backend enabled for three popular NetworkX algorithms: Betweenness Centrality, Breadth First Search, and Louvain Community Detection.\n",
-    "\n",
-    "Here is a sample minimal script to demonstrate no-code-change GPU acceleration using nx-cugraph.\n",
-    "\n",
-    "----\n",
-    "bc_demo.ipy:\n",
-    "\n",
-    "```\n",
-    "import pandas as pd\n",
-    "import networkx as nx\n",
-    "\n",
-    "url = \"https://data.rapids.ai/cugraph/datasets/cit-Patents.csv\"\n",
-    "df = pd.read_csv(url, sep=\" \", names=[\"src\", \"dst\"], dtype=\"int32\")\n",
-    "G = nx.from_pandas_edgelist(df, source=\"src\", target=\"dst\")\n",
-    "\n",
-    "%time result = nx.betweenness_centrality(G, k=10)\n",
-    "```\n",
-    "----\n",
-    "Running it with the nx-cugraph backend looks like this:\n",
-    "```\n",
-    "user@machine:/# ipython bc_demo.ipy\n",
-    "CPU times: user 7min 38s, sys: 5.6 s, total: 7min 44s\n",
-    "Wall time: 7min 44s\n",
-    "\n",
-    "user@machine:/# NETWORKX_BACKEND_PRIORITY=cugraph ipython bc_demo.ipy\n",
-    "CPU times: user 18.4 s, sys: 1.44 s, total: 19.9 s\n",
-    "Wall time: 20 s\n",
-    "```\n",
-    "----\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "First import the needed packages"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "import pandas as pd\n",
-    "import networkx as nx"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This installs nx-cugraph if not already present."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "try: \n",
-    "    import nx_cugraph\n",
-    "except ModuleNotFoundError:\n",
-    "    os.system('conda install -c rapidsai -c conda-forge -c nvidia nx-cugraph')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Download a patent citation dataset containing 3774768 nodes and 16518948 edges and loads it into a NetworkX graph."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "File ./data/cit-Patents.csv not found, downloading https://data.rapids.ai/cugraph/datasets/cit-Patents.csv\n"
-     ]
-    }
-   ],
-   "source": [
-    "filepath = \"./data/cit-Patents.csv\"\n",
-    "\n",
-    "if os.path.exists(filepath):\n",
-    "    url = filepath\n",
-    "else:\n",
-    "    url = \"https://data.rapids.ai/cugraph/datasets/cit-Patents.csv\"\n",
-    "    print(f\"File {filepath} not found, downloading {url}\")\n",
-    "\n",
-    "df = pd.read_csv(url, sep=\" \", names=[\"src\", \"dst\"], dtype=\"int32\")\n",
-    "G = nx.from_pandas_edgelist(df, source=\"src\", target=\"dst\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Define a function that can be used to run various NetworkX algorithms on the Graph created above. This can be used to compare run-times for NetworkX both without `nx-cugraph` and with `nx-cugraph` enabled.\n",
-    "\n",
-    "The following NetworkX calls will be run:\n",
-    "* [Betweenness Centrality](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.betweenness_centrality.html)\n",
-    "* [Breadth First Search](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.traversal.breadth_first_search.bfs_tree.html)\n",
-    "* [Louvain Community Detection](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.louvain.louvain_communities.html)\n",
-    "\n",
-    "This code does not require modification to use with nx-cugraph and can be used with NetworkX as-is even when no backends are installed."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def run_algos():\n",
-    "   print(\"\\nRunning Betweenness Centrality...\")\n",
-    "   %time nx.betweenness_centrality(G, k=10)\n",
-    "\n",
-    "   print(\"\\nRunning Breadth First Search (bfs_edges)...\")\n",
-    "   %time list(nx.bfs_edges(G, source=1))  # yields individual edges, use list() to force the full computation\n",
-    "\n",
-    "   print(\"\\nRunning Louvain...\")\n",
-    "   %time nx.community.louvain_communities(G, threshold=1e-04)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## NetworkX (no backend) Benchmark Runs\n",
-    "**_NOTE: NetworkX benchmarks without a backend for the graph used in this notebook can take very long time.  Using a Intel(R) Xeon(R) Gold 6128 CPU @ 3.40GHz with 45GB of memory, the three algo runs took approximately 50 minutes._**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Running Betweenness Centrality...\n",
-      "CPU times: user 7min 47s, sys: 5.61 s, total: 7min 53s\n",
-      "Wall time: 7min 52s\n",
-      "\n",
-      "Running Breadth First Search (bfs_edges)...\n",
-      "CPU times: user 28.9 s, sys: 336 ms, total: 29.2 s\n",
-      "Wall time: 29.1 s\n",
-      "\n",
-      "Running Louvain...\n",
-      "CPU times: user 42min 46s, sys: 4.8 s, total: 42min 51s\n",
-      "Wall time: 42min 50s\n"
-     ]
-    }
-   ],
-   "source": [
-    "run_algos()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## NetworkX with `nx-cugraph` Benchmark Runs\n",
-    "Use the `nx.config` API introduced in ([NetworkX 3.3](https://networkx.org/documentation/stable/reference/backends.html#networkx.utils.configs.NetworkXConfig)) to configure NetworkX to use nx-cugraph.  Both options used below can also be set using environment variables."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Set the prioritized list of backends to automatically try. If none of the backends in the list\n",
-    "# support the algorithm, NetworkX will use the default implementation).\n",
-    "#\n",
-    "# This can also be set using the environment variable NETWORKX_BACKEND_PRIORITY which accepts a\n",
-    "# comma-separated list.\n",
-    "nx.config.backend_priority = [\"cugraph\"]  # Try the \"cugraph\" (nx-cugraph) backend first, then\n",
-    "                                          # fall back to NetworkX\n",
-    "#nx.config.backend_priority = []          # Do not use any backends\n",
-    "\n",
-    "# Enable caching of graph conversions. When set to False (the default) nx-cugraph will convert\n",
-    "# the CPU-based NetworkX graph object to a nx-cugraph GPU-based graph object each time an algorithm\n",
-    "# is run. When True, the conversion will happen once and be saved for future use *if* the graph has\n",
-    "# not been modified via a supported method such as G.add_edge(u, v, weight=val)\n",
-    "#\n",
-    "# This can also be set using the environment variable NETWORKX_CACHE_CONVERTED_GRAPHS\n",
-    "nx.config.cache_converted_graphs = True\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Note the warning message NetworkX generates to remind us a cached graph should not be manually mutated. This is shown because caching was enabled, and the initial call resulted in a cached graph conversion for use with subsequent nx-cugraph calls.**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Running Betweenness Centrality...\n",
-      "CPU times: user 17.9 s, sys: 1.5 s, total: 19.4 s\n",
-      "Wall time: 19.1 s\n",
-      "\n",
-      "Running Breadth First Search (bfs_edges)...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/conda/lib/python3.10/site-packages/networkx/utils/backends.py:1101: UserWarning: Using cached graph for 'cugraph' backend in call to bfs_edges.\n",
-      "\n",
-      "For the cache to be consistent (i.e., correct), the input graph must not have been manually mutated since the cached graph was created. Examples of manually mutating the graph data structures resulting in an inconsistent cache include:\n",
-      "\n",
-      "    >>> G[u][v][key] = val\n",
-      "\n",
-      "and\n",
-      "\n",
-      "    >>> for u, v, d in G.edges(data=True):\n",
-      "    ...     d[key] = val\n",
-      "\n",
-      "Using methods such as `G.add_edge(u, v, weight=val)` will correctly clear the cache to keep it consistent. You may also use `G.__networkx_cache__.clear()` to manually clear the cache, or set `G.__networkx_cache__` to None to disable caching for G. Enable or disable caching via `nx.config.cache_converted_graphs` config.\n",
-      "  warnings.warn(warning_message)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CPU times: user 50.5 s, sys: 589 ms, total: 51 s\n",
-      "Wall time: 50.7 s\n",
-      "\n",
-      "Running Louvain...\n",
-      "CPU times: user 27.4 s, sys: 3.36 s, total: 30.7 s\n",
-      "Wall time: 30.6 s\n"
-     ]
-    }
-   ],
-   "source": [
-    "run_algos()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The Betweenness Centrality call above resulted in a conversion from a NetworkX Graph to a nx-cugraph Graph due to it being the first to use nx-cugraph. However, since caching was enabled, a second call will show the run-time for Betweenness Centrality without the need to convert the graph."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Running Betweenness Centrality (again)...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/conda/lib/python3.10/site-packages/networkx/utils/backends.py:1128: UserWarning: Using cached graph for 'cugraph' backend in call to betweenness_centrality.\n",
-      "\n",
-      "For the cache to be consistent (i.e., correct), the input graph must not have been manually mutated since the cached graph was created. Examples of manually mutating the graph data structures resulting in an inconsistent cache include:\n",
-      "\n",
-      "    >>> G[u][v][key] = val\n",
-      "\n",
-      "and\n",
-      "\n",
-      "    >>> for u, v, d in G.edges(data=True):\n",
-      "    ...     d[key] = val\n",
-      "\n",
-      "Using methods such as `G.add_edge(u, v, weight=val)` will correctly clear the cache to keep it consistent. You may also use `G.__networkx_cache__.clear()` to manually clear the cache, or set `G.__networkx_cache__` to None to disable caching for G. Enable or disable caching via `nx.config.cache_converted_graphs` config.\n",
-      "  warnings.warn(warning_message)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CPU times: user 1.84 s, sys: 312 ms, total: 2.15 s\n",
-      "Wall time: 2.12 s\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"\\nRunning Betweenness Centrality (again)...\")\n",
-    "%time result = nx.betweenness_centrality(G, k=10)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "___\n",
-    "Each user is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n",
-    "\n",
-    "Information on the U.S. Patent Citation Network dataset used in this notebook is as follows:\n",
-    "Authors: Jure Leskovec and Andrej Krevl\n",
-    "Title: SNAP Datasets, Stanford Large Network Dataset Collection\n",
-    "URL: http://snap.stanford.edu/data\n",
-    "Date: June 2014 \n",
-    "___\n",
-    "Copyright (c) 2024, NVIDIA CORPORATION.\n",
-    "\n",
-    "Licensed under the Apache License, Version 2.0 (the \"License\");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
-    "\n",
-    "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n",
-    "___"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.14"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/demo/accelerating_networkx.ipynb b/notebooks/demo/accelerating_networkx.ipynb
deleted file mode 100644
index 1a6c6cfb3f6..00000000000
--- a/notebooks/demo/accelerating_networkx.ipynb
+++ /dev/null
@@ -1,614 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "R2cpVp2WdOsp"
-      },
-      "source": [
-        "# NetworkX - Easy Graph Analytics\n",
-        "\n",
-        "NetworkX is the most popular library for graph analytics available in Python, or quite possibly any language. To illustrate this, NetworkX was downloaded more than 71 million times in September of 2024 alone, which is roughly 71 times more than the next most popular graph analytics library! [*](https://en.wikipedia.org/wiki/NetworkX) NetworkX has earned this popularity from its very easy-to-use API, the wealth of documentation and examples available, the large (and friendly) community behind it, and its easy installation which requires nothing more than Python.\n",
-        "\n",
-        "However, NetworkX users are familiar with the tradeoff that comes with those benefits. The pure-Python implementation often results in poor performance when graph data starts to reach larger scales, limiting the usefulness of the library for many real-world problems.\n",
-        "\n",
-        "# Accelerated NetworkX - Easy (and fast!) Graph Analytics\n",
-        "\n",
-        "To address the performance problem, NetworkX 3.0 introduced a mechanism to dispatch algorithm calls to alternate implementations. The NetworkX Python API remains the same but NetworkX will use more capable algorithm implementations provided by one or more backends. This approach means users don't have to give up NetworkX -or even change their code- in order to take advantage of GPU performance."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "xkg10FrNThrK"
-      },
-      "source": [
-        "# Let's Get the Environment Setup\n",
-        "This notebook will demonstrate NetworkX both with and without GPU acceleration provided by the `nx-cugraph` backend.\n",
-        "\n",
-        "`nx-cugraph` is available as a package installable using `pip`, `conda`, and [from source](https://github.com/rapidsai/nx-cugraph).  Before importing `networkx`, lets install `nx-cugraph` so it can be registered as an available backend by NetworkX when needed.  We'll use `pip` to install.\n",
-        "\n",
-        "NOTES:\n",
-        "* `nx-cugraph` requires a compatible NVIDIA GPU, NVIDIA CUDA and associated drivers, and a supported OS. Details about these and other installation prerequisites can be seen [here](https://docs.rapids.ai/install#system-req).\n",
-        "* The `nx-cugraph` package is currently hosted by NVIDIA and therefore the `--extra-index-url` option must be used.\n",
-        "* `nx-cugraph` is supported on specific 11.x and 12.x CUDA versions, and the major version number must be known in order to install the correct build (this is determined automatically when using `conda`).\n",
-        "\n",
-        "To find the CUDA major version on your system, run the following command:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "NMFwzc1I95BS"
-      },
-      "outputs": [],
-      "source": [
-        "!nvcc --version"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "i91Yj-yZ-nGS"
-      },
-      "source": [
-        "From the above output we can see we're using CUDA 12.x so we'll be installing `nx-cugraph-cu12`. If we were using CUDA 11.x, the package name would be `nx-cugraph-cu11`. We'll also be adding `https://pypi.nvidia.com` as an `--extra-index-url`:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "mYYN9EpnWphu"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install nx-cugraph-cu12 --extra-index-url=https://pypi.nvidia.com"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "0h1K-7tI_AZH"
-      },
-      "source": [
-        "Of course, we'll also be using `networkx`, which is already provided in the Colab environment. This notebook will be using features added in version 3.3, so we'll import it here to verify we have a compatible version."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "YTV0ZTME2tV6"
-      },
-      "outputs": [],
-      "source": [
-        "import networkx as nx\n",
-        "nx.__version__"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "UiZKOa3WC7be"
-      },
-      "source": [
-        "# Let's Start with Something Simple\n",
-        "\n",
-        "To begin, we'll compare NetworkX results without a backend to results of the same algorithm using the `nx-cugraph` backend on a small graph.  `nx.karate_club_graph()` returns an instance of the famous example graph consisting of 34 nodes and 78 edges from Zachary's paper, described [here](https://en.wikipedia.org/wiki/Zachary%27s_karate_club)."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "3atL3tI0frYm"
-      },
-      "source": [
-        "## Betweenness Centrality\n",
-        "[Betweenness Centrality](https://en.wikipedia.org/wiki/Betweenness_centrality) is a graph algorithm that computes a centrality score for each node (`v`) based on how many of the shortest paths between pairs of nodes in the graph pass through `v`. A higher centrality score represents a node that \"connects\" other nodes in a network more than that of a node with a lower score.\n",
-        "\n",
-        "First, let's create a NetworkX Graph instance of the the Karate Club graph and inspect it."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "JSw7EZ46-kRu"
-      },
-      "outputs": [],
-      "source": [
-        "G = nx.karate_club_graph()\n",
-        "G.number_of_nodes(), G.number_of_edges()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "_-E17u2gKgbC"
-      },
-      "source": [
-        "Next, let's run betweenness centrality and save the results.  Because the Karate Club graph is so small, this should not take long."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "qjxXXKJhKQ4s"
-      },
-      "outputs": [],
-      "source": [
-        "%%time\n",
-        "nx_bc_results = nx.betweenness_centrality(G)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "ClrR3z9XMfLr"
-      },
-      "source": [
-        "Now, let's run the same algorithm on the same data using the `nx-cugraph` backend.\n",
-        "\n",
-        "There are several ways to instruct NetworkX to use a particular backend instead of the default implementation. Here, we will use the `config` API, which was added in NetworkX version 3.3.\n",
-        "\n",
-        "The following two lines set the backend to \"cugraph\" and enable graph conversion caching.\n",
-        "\n",
-        "Some notes:\n",
-        "* The standard convention for NetworkX backends is to name the package with a `nx-` prefix to denote that these are packages intended to be used with NetworkX, but the `nx-` prefix is not included when referring to them in NetworkX API calls. Here, `nx-cugraph` is the name of the backend package, and `\"cugraph\"` is the name NetworkX will use to refer to it.\n",
-        "* NetworkX can use multiple backends! `nx.config.backend_priority` is a list that can contain several backends, ordered based on priority. If a backend in the list cannot run a particular algorithm (either because it isn't supported in the backend, the algorithm doesn't support a particular option, or some other reason), NetworkX will try the next backend in the list. If no specified backend is able to run the algorithm, NetworkX will fall back to the default implementation.\n",
-        "* Many backends have their own data structures for representing an input graph, often optimized for that backend's implementation. Prior to running a backend algorithm, NetworkX will have the backend convert the standard NetworkX Graph instance to the backend-specific type. This conversion can be expensive, and rather than repeat it as part of each algorithm call, NetworkX can cache the conversion so it can be skipped on future calls if the graph doesn't change. This caching can save significant time and improve overall performance."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "oFHwNqqsNsqS"
-      },
-      "outputs": [],
-      "source": [
-        "nx.config.backend_priority=[\"cugraph\"]  # NETWORKX_BACKEND_PRIORITY=cugraph\n",
-        "nx.config.cache_converted_graphs=True   # NETWORKX_CACHE_CONVERTED_GRAPHS=True"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "HrUeWRRQRzFP"
-      },
-      "outputs": [],
-      "source": [
-        "%%time\n",
-        "nxcg_bc_results = nx.betweenness_centrality(G)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "z1hxut3GTj5A"
-      },
-      "source": [
-        "You may have noticed that using the `nx-cugraph` backend resulted in a slightly slower execution time. This is not surprising when working with a graph this small, since the overhead of converting the graph for the first time and launching the algorithm kernel on the GPU is actually significantly more than the computation time itself.  We'll see later that this overhead is negligible when compared to the time saved when running on a GPU for larger graphs.\n",
-        "\n",
-        "Since we've enabled graph conversion caching, we can see that if we re-run the same call the execution time is noticeably shorter."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "7a0XvpUOr9Ju"
-      },
-      "outputs": [],
-      "source": [
-        "%%time\n",
-        "nxcg_bc_results = nx.betweenness_centrality(G)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "ppjE5J5RscOe"
-      },
-      "source": [
-        "Notice the warning above about using the cache. This will only be raised **once** per graph instance (it can also be easily disabled), but its purpose is to point out that the cache should not be used if the Graph object will have its attribute dictionary modified directly. In this case and many others, we won't be modifying the dictionaries directly. Instead, we will use APIs such as `nx.set_node_attributes` which properly clear the cache, so it's safe for us to use the cache. Because of that, we'll disable the warning so we don't see it on other graphs in this session."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Namb5JLvwS-q"
-      },
-      "outputs": [],
-      "source": [
-        "import warnings\n",
-        "warnings.filterwarnings(\"ignore\", message=\"Using cached graph for 'cugraph' backend\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "BzGAphcILFsT"
-      },
-      "source": [
-        "Smaller graphs are also easy to visualize with NetworkX's plotting utilities. The flexibility of NetworkX's `Graph` instances make it trivial to add the betweenness centrality scores back to the graph object as node attributes. This will allow us to use those values for the visualization.\n",
-        "\n",
-        "In this case, we'll create new attributes for each node called \"nx_bc\" for the default NetworkX results, and \"nxcg_bc\" for the nx-cugraph results. We'll use those values to assign the color for each node and plot two graphs side-by-side. This will make it easy to visually validate that the nodes with the higher centrality scores for both implementations match and do indeed appear to be more \"central\" to other nodes."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "1coV6ZfcUoqI"
-      },
-      "outputs": [],
-      "source": [
-        "nx.set_node_attributes(G, nx_bc_results, \"nx_bc\")\n",
-        "nx.set_node_attributes(G, nxcg_bc_results, \"nxcg_bc\")"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Sba2iYJgLoN2"
-      },
-      "outputs": [],
-      "source": [
-        "# Configure plot size and layout/position for each node\n",
-        "import matplotlib.pyplot as plt\n",
-        "plt.rcParams['figure.figsize'] = [12, 8]\n",
-        "pos = nx.spring_layout(G)\n",
-        "\n",
-        "# Assign colors for each set of betweenness centrality results\n",
-        "nx_colors = [G.nodes[n][\"nx_bc\"] for n in G.nodes()]\n",
-        "nxcg_colors = [G.nodes[n][\"nxcg_bc\"] for n in G.nodes()]\n",
-        "\n",
-        "# Plot the graph and color each node corresponding to NetworkX betweenness centrality values\n",
-        "plt.subplot(1, 2, 1)\n",
-        "nx.draw(G, pos=pos, with_labels=True, node_color=nx_colors)\n",
-        "\n",
-        "# Plot the graph and color each node corresponding to nx-cugraph betweenness centrality values\n",
-        "plt.subplot(1, 2, 2)\n",
-        "nx.draw(G, pos=pos, with_labels=True, node_color=nxcg_colors)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "dJXH4Zn5VNSg"
-      },
-      "source": [
-        "As we can see, the same two nodes (`0` and `33`) are the two most central in both graphs, followed by `2`, `31`, and `32`.\n",
-        "\n",
-        "## PageRank\n",
-        "Another popular algorithm is [PageRank](https://en.wikipedia.org/wiki/PageRank). PageRank also assigns scores to each node, but these scores are based on analyzing links to each node to determine relative \"importance\" within the graph.\n",
-        "\n",
-        "Let's update the config to use the default NetworkX implementation and run `nx.pagerank`."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "9CdYNk62E1v_"
-      },
-      "outputs": [],
-      "source": [
-        "nx.config.backend_priority=[]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Jo39YxVmYolq"
-      },
-      "outputs": [],
-      "source": [
-        "%%time\n",
-        "nx_pr_results = nx.pagerank(G)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "sV6dM8ToZDiC"
-      },
-      "source": [
-        "We could set `nx.config.backend_priority` again to list `\"cugraph\"` as the backend, but let's instead show how the `backend` kwarg can be used to override the priority list and force a specific backend to be used."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "oMSvQVGKY0rn"
-      },
-      "outputs": [],
-      "source": [
-        "%%time\n",
-        "nxcg_pr_results = nx.pagerank(G, backend=\"cugraph\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "ZGux_8xFZneI"
-      },
-      "source": [
-        "In this example, instead of plotting the graph to show that the results are identical, we can compare them directly using the saved values from both runs."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "RcmtdFy4Zw7p"
-      },
-      "outputs": [],
-      "source": [
-        "sorted(nx_pr_results) == sorted(nxcg_pr_results)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "mefjUEAnZ4pq"
-      },
-      "source": [
-        "# Working with Bigger Data"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "yLY-yl6PuNYo"
-      },
-      "source": [
-        "Now we'll look at a larger dataset from https://snap.stanford.edu/data/cit-Patents.html which contains citations across different U.S. patents granted from January 1, 1963 to December 30, 1999. The dataset represents 16.5M citations (edges) between 3.77M patents (nodes).\n",
-        "\n",
-        "This will demonstrate that data of this size starts to push the limits of the default pure-Python NetworkX implementation."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "lyYF0LbtFwjh"
-      },
-      "outputs": [],
-      "source": [
-        "# The locale encoding may have been modified from the plots above, reset here to run shell commands\n",
-        "import locale\n",
-        "locale.getpreferredencoding = lambda: \"UTF-8\"\n",
-        "!wget https://data.rapids.ai/cugraph/datasets/cit-Patents.csv  # Skip if cit-Patents.csv already exists.\n",
-        "# !wget https://snap.stanford.edu/data/cit-Patents.txt.gz  # Skip if cit-Patents.txt.gz already exists."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "kjGINYphQSQ2"
-      },
-      "outputs": [],
-      "source": [
-        "%load_ext cudf.pandas\n",
-        "import pandas as pd"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "iV4DieGZOalc"
-      },
-      "outputs": [],
-      "source": [
-        "%%time\n",
-        "df = pd.read_csv(\"cit-Patents.csv\",\n",
-        "                sep=\" \",\n",
-        "                names=[\"src\", \"dst\"],\n",
-        "                dtype=\"int32\",\n",
-        ")\n",
-        "# df = pd.read_csv(\"cit-Patents.txt.gz\",\n",
-        "#                  compression=\"gzip\",\n",
-        "#                  skiprows=4,\n",
-        "#                  sep=\"\\t\",\n",
-        "#                  names=[\"src\", \"dst\"],\n",
-        "#                  dtype=\"int32\",\n",
-        "# )"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "PREA67u4eKat"
-      },
-      "outputs": [],
-      "source": [
-        "%%time\n",
-        "G = nx.from_pandas_edgelist(df, source=\"src\", target=\"dst\")\n",
-        "G.number_of_nodes(), G.number_of_edges()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "NcsUxBqpu4zY"
-      },
-      "source": [
-        "By default, `nx.betweenness_centrality` will perform an all-pairs shortest path analysis when determining the centrality scores for each node. However, due to the much larger size of this graph, determining the shortest path for all pairs of nodes in the graph is not feasible. Instead, we'll use the parameter `k` to limit the number of shortest path computations used for determining the centrality scores, at the expense of accuracy. As we'll see when using a dataset this size with `nx.betweenness_centrality`, we have to limit `k` to `1` which is not practical but is sufficient here for demonstration purposes (since anything larger than `1` will result in many minutes of execution time)."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "gNDWbj3kAk3j"
-      },
-      "outputs": [],
-      "source": [
-        "%%time\n",
-        "bc_results = nx.betweenness_centrality(G, k=1)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "NB8xmxMd1PlX"
-      },
-      "source": [
-        "Now we'll configure NetworkX to use the `nx-cugraph` backend (again, using the name convention that drops the package name's `nx-` prefix) and run the same call. Because this is a Graph that `nx-cugraph` hasn't seen before, the runtime will include the time to convert and cache a GPU-based graph."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "xUYNG1xhvbWc"
-      },
-      "outputs": [],
-      "source": [
-        "nx.config.backend_priority = [\"cugraph\"]"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "cmK8ZuQGvfPo"
-      },
-      "outputs": [],
-      "source": [
-        "%%time\n",
-        "bc_results = nx.betweenness_centrality(G, k=1)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "vdHb1YXP15TZ"
-      },
-      "source": [
-        "Let's run betweenness centrality again, now with a more useful number of samples by setting `k=100`."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "fKjIrzL-vrGS"
-      },
-      "outputs": [],
-      "source": [
-        "%%time\n",
-        "bc_results = nx.betweenness_centrality(G, k=100)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "QeMcrAX2HZSM"
-      },
-      "source": [
-        "Let's also run pagerank on the same dataset to compare."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "gR8ID6ekHgHt"
-      },
-      "outputs": [],
-      "source": [
-        "nx.config.backend_priority = []"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "rTFuvX5wb_c1"
-      },
-      "outputs": [],
-      "source": [
-        "%%time\n",
-        "nx_pr_results = nx.pagerank(G)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "8sJx9aeJV9hv"
-      },
-      "outputs": [],
-      "source": [
-        "%%time\n",
-        "nxcg_pr_results = nx.pagerank(G, backend=\"cugraph\")"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "wGOVQ6ZyY4Ih"
-      },
-      "outputs": [],
-      "source": [
-        "sorted(nx_pr_results) == sorted(nxcg_pr_results)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "k2DfAaZaDIBj"
-      },
-      "source": [
-        "---\n",
-        "<i>\n",
-        "Information on the U.S. Patent Citation Network dataset used in this notebook is as follows:\n",
-        "<br>Authors: Jure Leskovec and Andrej Krevl\n",
-        "<br>Title: SNAP Datasets, Stanford Large Network Dataset Collection\n",
-        "<br>URL: http://snap.stanford.edu/data\n",
-        "<br>Date: June 2014\n",
-        "</i>\n"
-      ]
-    }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "gpuType": "T4",
-      "provenance": []
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.12.4"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
diff --git a/notebooks/demo/nx_cugraph_demo.ipynb b/notebooks/demo/nx_cugraph_demo.ipynb
deleted file mode 100644
index f1ce80aa188..00000000000
--- a/notebooks/demo/nx_cugraph_demo.ipynb
+++ /dev/null
@@ -1,672 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# `nx-cugraph`: a NetworkX backend that provides GPU acceleration with RAPIDS cuGraph"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This notebook will demonstrate the `nx-cugraph` NetworkX backend using the NetworkX betweenness_centrality algorithm.\n",
-    "\n",
-    "## Background\n",
-    "Networkx version 3.0 introduced a dispatching mechanism that allows users to configure NetworkX to dispatch various algorithms to third-party backends. Backends can provide different implementations of graph algorithms, allowing users to take advantage of capabilities not available in NetworkX. `nx-cugraph` is a NetworkX backend provided by the [RAPIDS](https://rapids.ai) cuGraph project that adds GPU acceleration to greatly improve performance.\n",
-    "\n",
-    "## System Requirements\n",
-    "Using `nx-cugraph` with this notebook requires the following: \n",
-    "- NVIDIA GPU, Pascal architecture or later\n",
-    "- CUDA 11.2, 11.4, 11.5, 11.8, or 12.0\n",
-    "- Python versions 3.10, 3.11, or 3.12\n",
-    "- NetworkX >= version 3.2\n",
-    "  - _NetworkX 3.0 supports dispatching and is compatible with `nx-cugraph`, but this notebook will demonstrate features added in 3.2_\n",
-    "  - At the time of this writing, NetworkX 3.2 is only available from source and can be installed by following the [development version install instructions](https://github.com/networkx/networkx/blob/main/INSTALL.rst#install-the-development-version).\n",
-    "- Pandas\n",
-    "\n",
-    "More details about system requirements can be found in the [RAPIDS System Requirements documentation](https://docs.rapids.ai/install#system-req)."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## Installation"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Assuming NetworkX >= 3.2 has been installed using the [development version install instructions](https://github.com/networkx/networkx/blob/main/INSTALL.rst#install-the-development-version), `nx-cugraph` can be installed using either `conda` or `pip`.  \n",
-    "\n",
-    "#### conda\n",
-    "```\n",
-    "conda install -c rapidsai-nightly -c conda-forge -c nvidia nx-cugraph\n",
-    "```\n",
-    "#### pip\n",
-    "```\n",
-    "python -m pip install nx-cugraph-cu11 --extra-index-url https://pypi.nvidia.com\n",
-    "```\n",
-    "#### _Notes:_\n",
-    " * nightly wheel builds will not be available until the 23.12 release, therefore the index URL for the stable release version is being used in the pip install command above.\n",
-    " * Additional information relevant to installing any RAPIDS package can be found [here](https://rapids.ai/#quick-start).\n",
-    " * If you installed any of the packages described here since running this notebook, you may need to restart the kernel to have them visible to this notebook."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## Notebook Helper Functions\n",
-    "\n",
-    "A few helper functions will be defined here that will be used in order to help keep this notebook easy to read."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sys\n",
-    "def reimport_networkx():\n",
-    "    \"\"\"\n",
-    "    Re-imports networkx for demonstrating different backend configuration\n",
-    "    options applied at import-time. This is only needed for demonstration\n",
-    "    purposes since other mechanisms are available for runtime configuration.\n",
-    "    \"\"\"\n",
-    "    # Using importlib.reload(networkx) has several caveats (described here:\n",
-    "    # https://docs.python.org/3/library/imp.html?highlight=reload#imp.reload)\n",
-    "    # which result in backend configuration not being re-applied correctly.\n",
-    "    # Instead, manually remove all modules and re-import\n",
-    "    nx_mods = [m for m in sys.modules.keys()\n",
-    "               if (m.startswith(\"networkx\") or m.startswith(\"nx_cugraph\"))]\n",
-    "    for m in nx_mods:\n",
-    "        sys.modules.pop(m)\n",
-    "    import networkx\n",
-    "    return networkx\n",
-    "\n",
-    "\n",
-    "from pathlib import Path\n",
-    "import requests\n",
-    "import gzip\n",
-    "import pandas as pd\n",
-    "def create_cit_patents_graph(verbose=True):\n",
-    "    \"\"\"\n",
-    "    Downloads the cit-Patents dataset (if not previously downloaded), reads\n",
-    "    it, and creates a nx.DiGraph from it and returns it.\n",
-    "    cit-Patents is described here:\n",
-    "    https://snap.stanford.edu/data/cit-Patents.html\n",
-    "    \"\"\"\n",
-    "    url = \"https://snap.stanford.edu/data/cit-Patents.txt.gz\"\n",
-    "    gz_file_name = Path(url.split(\"/\")[-1])\n",
-    "    csv_file_name = Path(gz_file_name.stem)\n",
-    "    if csv_file_name.exists():\n",
-    "        if verbose: print(f\"{csv_file_name} already exists, not downloading.\")\n",
-    "    else:\n",
-    "        if verbose: print(f\"downloading {url}...\", end=\"\", flush=True)\n",
-    "        req = requests.get(url)\n",
-    "        open(gz_file_name, \"wb\").write(req.content)\n",
-    "        if verbose: print(\"done\")\n",
-    "        if verbose: print(f\"unzipping {gz_file_name}...\", end=\"\", flush=True)\n",
-    "        with gzip.open(gz_file_name, \"rb\") as gz_in:\n",
-    "            with open(csv_file_name, \"wb\") as txt_out:\n",
-    "                txt_out.write(gz_in.read())\n",
-    "        if verbose: print(\"done\")\n",
-    "\n",
-    "    if verbose: print(\"reading csv to dataframe...\", end=\"\", flush=True)\n",
-    "    pandas_edgelist = pd.read_csv(\n",
-    "        csv_file_name.name,\n",
-    "        skiprows=4,\n",
-    "        delimiter=\"\\t\",\n",
-    "        names=[\"src\", \"dst\"],\n",
-    "        dtype={\"src\":\"int32\", \"dst\":\"int32\"},\n",
-    "    )\n",
-    "    if verbose: print(\"done\")\n",
-    "    if verbose: print(\"creating NX graph from dataframe...\", end=\"\", flush=True)\n",
-    "    G = nx.from_pandas_edgelist(\n",
-    "        pandas_edgelist, source=\"src\", target=\"dst\", create_using=nx.DiGraph\n",
-    "    )\n",
-    "    if verbose: print(\"done\")\n",
-    "    return G"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## Running `betweenness_centrality`\n",
-    "Let's start by running `betweenness_centrality` on the Karate Club graph using the default NetworkX implementation."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "### Zachary's Karate Club\n",
-    "\n",
-    "Zachary's Karate Club is a small dataset consisting of 34 nodes and 78 edges which represent the friendships between members of a karate club. This dataset is small enough to make comparing results between NetworkX and `nx-cugraph` easy."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import networkx as nx\n",
-    "karate_club_graph = nx.karate_club_graph()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "Having NetworkX compute the `betweenness_centrality` values for each node on this graph is quick and easy."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2.51 ms ± 1.02 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%timeit global karate_nx_bc_results\n",
-    "karate_nx_bc_results = nx.betweenness_centrality(karate_club_graph)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "### Automatic GPU acceleration\n",
-    "When `nx-cugraph` is installed, NetworkX will detect it on import and make it available as a backend for APIs supported by that backend.  However, NetworkX does not assume the user always wants to use a particular backend, and instead looks at various configuration mechanisms in place for users to specify how NetworkX should use installed backends. Since NetworkX was not configured to use a backend for the above `betweenness_centrality` call, it used the default implementation provided by NetworkX.\n",
-    "\n",
-    "The first configuration mechanism to be demonstrated below is the `NETWORKX_AUTOMATIC_BACKENDS` environment variable.  This environment variable directs NetworkX to use the backend specified everywhere it's supported and does not require the user to modify any of their existing NetworkX code.\n",
-    "\n",
-    "To use it, a user sets `NETWORKX_AUTOMATIC_BACKENDS` in their shell to the backend they'd like to use.  If a user has more than one backend installed, the environment variable can also accept a comma-separated list of backends, ordered by priority in which NetworkX should use them, where the first backend that supports a particular API call will be used.  For example:\n",
-    "```\n",
-    "bash> export NETWORKX_AUTOMATIC_BACKENDS=cugraph\n",
-    "bash> python my_nx_app.py  # uses nx-cugraph wherever possible, then falls back to default implementation where it's not.\n",
-    "```\n",
-    "or in the case of multiple backends installed\n",
-    "```\n",
-    "bash> export NETWORKX_AUTOMATIC_BACKENDS=cugraph,graphblas\n",
-    "bash> python my_nx_app.py  # uses nx-cugraph if possible, then nx-graphblas if possible, then default implementation.\n",
-    "```\n",
-    "\n",
-    "NetworkX looks at the environment variable and the installed backends at import time, and will not re-examine the environment after that.  Because `networkx` was already imported in this notebook, the `reimport_nx()` utility will be called after the `os.environ` dictionary is updated to simulate an environment variable being set in the shell.\n",
-    "\n",
-    "**Please note, this is only needed for demonstration purposes to compare runs both with and without fully-automatic backend use enabled.**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "os.environ[\"NETWORKX_AUTOMATIC_BACKENDS\"] = \"cugraph\"\n",
-    "nx = reimport_networkx()\n",
-    "# reimporting nx requires reinstantiating Graphs since python considers\n",
-    "# types from the prior nx import != types from the reimported nx\n",
-    "karate_club_graph = nx.karate_club_graph()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "Once the environment is updated, re-running the same `betweenness_centrality` call on the same graph requires no code changes."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "43.9 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%timeit global karate_cg_bc_results\n",
-    "karate_cg_bc_results = nx.betweenness_centrality(karate_club_graph)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "We may see that the same computation actually took *longer* using `nx-cugraph`. This is not too surprising given how small the graph is, since there's a small amount of overhead to copy data to and from the GPU which becomes more obvious on very small graphs.  We'll see with a larger graph how this overhead becomes negligible."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "### Results Comparison\n",
-    "\n",
-    "Let's examine the results of each run to see how they compare.  \n",
-    "The `betweenness_centrality` results are a dictionary mapping vertex IDs to betweenness_centrality scores.  The score itself is usually not as important as the relative rank of each vertex ID (e.g. vertex A is ranked higher than vertex B in both sets of results)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "NX: (0, 0.437635), CG: (0, 0.437635)\n",
-      "NX: (33, 0.304075), CG: (33, 0.304075)\n",
-      "NX: (32, 0.145247), CG: (32, 0.145247)\n",
-      "NX: (2, 0.143657), CG: (2, 0.143657)\n",
-      "NX: (31, 0.138276), CG: (31, 0.138276)\n",
-      "NX: (8, 0.055927), CG: (8, 0.055927)\n",
-      "NX: (1, 0.053937), CG: (1, 0.053937)\n",
-      "NX: (13, 0.045863), CG: (13, 0.045863)\n",
-      "NX: (19, 0.032475), CG: (19, 0.032475)\n",
-      "NX: (5, 0.029987), CG: (5, 0.029987)\n",
-      "NX: (6, 0.029987), CG: (6, 0.029987)\n",
-      "NX: (27, 0.022333), CG: (27, 0.022333)\n",
-      "NX: (23, 0.017614), CG: (23, 0.017614)\n",
-      "NX: (30, 0.014412), CG: (30, 0.014412)\n",
-      "NX: (3, 0.011909), CG: (3, 0.011909)\n",
-      "NX: (25, 0.003840), CG: (25, 0.003840)\n",
-      "NX: (29, 0.002922), CG: (29, 0.002922)\n",
-      "NX: (24, 0.002210), CG: (24, 0.002210)\n",
-      "NX: (28, 0.001795), CG: (28, 0.001795)\n",
-      "NX: (9, 0.000848), CG: (9, 0.000848)\n",
-      "NX: (4, 0.000631), CG: (4, 0.000631)\n",
-      "NX: (10, 0.000631), CG: (10, 0.000631)\n",
-      "NX: (7, 0.000000), CG: (7, 0.000000)\n",
-      "NX: (11, 0.000000), CG: (11, 0.000000)\n",
-      "NX: (12, 0.000000), CG: (12, 0.000000)\n",
-      "NX: (14, 0.000000), CG: (14, 0.000000)\n",
-      "NX: (15, 0.000000), CG: (15, 0.000000)\n",
-      "NX: (16, 0.000000), CG: (16, 0.000000)\n",
-      "NX: (17, 0.000000), CG: (17, 0.000000)\n",
-      "NX: (18, 0.000000), CG: (18, 0.000000)\n",
-      "NX: (20, 0.000000), CG: (20, 0.000000)\n",
-      "NX: (21, 0.000000), CG: (21, 0.000000)\n",
-      "NX: (22, 0.000000), CG: (22, 0.000000)\n",
-      "NX: (26, 0.000000), CG: (26, 0.000000)\n"
-     ]
-    }
-   ],
-   "source": [
-    "# The lists contain tuples of (vertex ID, betweenness_centrality score),\n",
-    "# sorted based on the score.\n",
-    "nx_sorted = sorted(karate_nx_bc_results.items(), key=lambda t:t[1], reverse=True)\n",
-    "cg_sorted = sorted(karate_cg_bc_results.items(), key=lambda t:t[1], reverse=True)\n",
-    "\n",
-    "for i in range(len(nx_sorted)):\n",
-    "    print(\"NX: (%d, %.6f), CG: (%d, %.6f)\" % (nx_sorted[i] + cg_sorted[i]))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "Here we can see that the results match exactly as expected.  \n",
-    "\n",
-    "For larger graphs, results are harder to compare given that `betweenness_centrality` is an approximation algorithm influenced by the random selection of paths used to compute the betweenness_centrality score of each vertex.  The argument `k` is used for limiting the number of paths used in the computation, since using every path for every vertex would be prohibitively expensive for large graphs.  For small graphs, `k` need not be specified, which allows `betweenness_centrality` to use all paths for all vertices and makes for an easier comparison."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "### `betweenness_centrality` on larger graphs - The U.S. Patent Citation Network<sup>1</sup>\n",
-    "\n",
-    "The U.S. Patent Citation Network dataset is much larger with over 3.7M nodes and over 16.5M edges and demonstrates how `nx-cugraph` enables NetworkX to run `betweenness_centrality` on graphs this large (and larger) in seconds instead of minutes.\n",
-    "\n",
-    "#### NetworkX default implementation"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "downloading https://snap.stanford.edu/data/cit-Patents.txt.gz...done\n",
-      "unzipping cit-Patents.txt.gz...done\n",
-      "reading csv to dataframe...done\n",
-      "creating NX graph from dataframe...done\n"
-     ]
-    }
-   ],
-   "source": [
-    "import os\n",
-    "# Unset NETWORKX_AUTOMATIC_BACKENDS so the default NetworkX implementation is used\n",
-    "os.environ.pop(\"NETWORKX_AUTOMATIC_BACKENDS\", None)\n",
-    "nx = reimport_networkx()\n",
-    "# Create the cit-Patents graph - this will also download the dataset if not previously downloaded\n",
-    "cit_patents_graph = create_cit_patents_graph()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Since this is a large graph, a k value must be set so the computation returns in a reasonable time\n",
-    "k = 40"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "Because this run will take time, `%%timeit` is restricted to a single pass.\n",
-    "\n",
-    "*NOTE: this run may take approximately 1 minute*"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1min 4s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%timeit -r 1\n",
-    "results = nx.betweenness_centrality(cit_patents_graph, k=k)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "Something to note is that `%%timeit` disables garbage collection by default, which may not be something a user is able to do. To see a more realistic real-world run time, `gc` can be enabled."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# import and run the garbage collector upfront prior to using it in the benchmark\n",
-    "import gc\n",
-    "gc.collect()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "*NOTE: this run may take approximately 7 minutes!*"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "6min 50s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%timeit -r 1 gc.enable()\n",
-    "nx.betweenness_centrality(cit_patents_graph, k=k)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### `nx-cugraph`\n",
-    "\n",
-    "Running on a GPU using `nx-cugraph` can result in a tremendous speedup, especially when graphs reach sizes larger than a few thousand nodes or `k` values become larger to increase accuracy.\n",
-    "\n",
-    "Rather than setting the `NETWORKX_AUTOMATIC_BACKENDS` environment variable and re-importing again, this example will demonstrate the `backend=` keyword argument to explicitly direct the NetworkX dispatcher to use the `cugraph` backend."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "10.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%timeit -r 1 gc.enable()\n",
-    "nx.betweenness_centrality(cit_patents_graph, k=k, backend=\"cugraph\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "k = 150"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "11.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%timeit -r 1 gc.enable()\n",
-    "nx.betweenness_centrality(cit_patents_graph, k=k, backend=\"cugraph\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "For the same graph and the same `k` value, the `\"cugraph\"` backend returns results in seconds instead of minutes.  Increasing the `k` value has very little relative impact to runtime due to the high parallel processing ability of the GPU, allowing the user to get improved accuracy for virtually no additional cost."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Type-based dispatching\n",
-    "\n",
-    "NetworkX also supports automatically dispatching to backends associated with specific graph types.  This requires the user to write code for a specific backend, and therefore requires the backend to be installed, but has the advantage of ensuring a particular behavior without the potential for runtime conversions.\n",
-    "\n",
-    "To use type-based dispatching with `nx-cugraph`, the user must import the backend directly in their code to access the utilities provided to create a Graph instance specifically for the `nx-cugraph` backend."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import nx_cugraph as nxcg"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "The `from_networkx()` API will copy the data from the NetworkX graph instance to the GPU and return a new `nx-cugraph` graph instance.  By passing an explicit `nx-cugraph` graph, the NetworkX dispatcher will automatically call the `\"cugraph\"` backend (and only the `\"cugraph\"` backend) without requiring future conversions to copy data to the GPU."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "7.92 s ± 2.85 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%timeit -r 2 global nxcg_cit_patents_graph\n",
-    "nxcg_cit_patents_graph = nxcg.from_networkx(cit_patents_graph)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "3.14 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%timeit -r 1 gc.enable()\n",
-    "nx.betweenness_centrality(nxcg_cit_patents_graph, k=k)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Conclusion\n",
-    "\n",
-    "This notebook demonstrated `nx-cugraph`'s support for `betweenness_centrality`.  At the time of this writing, `nx-cugraph` also provides support for `edge_netweenness_centrality` and `louvain_communities`.  Other algorithms are scheduled to be supported based on their availability in the cuGraph [pylibcugraph](https://github.com/rapidsai/cugraph/tree/branch-23.10/python/pylibcugraph/pylibcugraph) package and demand by the NetworkX community.\n",
-    "\n",
-    "#### Benchmark Results\n",
-    "The results included in this notebook were generated on a workstation with the following hardware:\n",
-    "\n",
-    "<table align=\"left\">\n",
-    "    <tr><td>CPU:</td><td>Intel(R) Xeon(R) Gold 6128 CPU @ 3.40GHz, 45GB</td></tr>\n",
-    "    <tr><td>GPU:</td><td>Quatro RTX 8000, 50GB</td></tr>\n",
-    "</table>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<sup>1</sup> Information on the U.S. Patent Citation Network dataset used in this notebook is as follows:\n",
-    "<table align=\"left\">\n",
-    "    <tr><td>Authors:</td><td>Jure Leskovec and Andrej Krevl</td></tr>\n",
-    "    <tr><td>Title:</td><td>SNAP Datasets, Stanford Large Network Dataset Collection</td></tr>\n",
-    "    <tr><td>URL:</td><td>http://snap.stanford.edu/data</td></tr>\n",
-    "    <tr><td>Date:</td><td>June 2014</td></tr>\n",
-    "</table>\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.12"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/python/.coveragerc b/python/.coveragerc
index 1c33570c05c..9e15f7d1acc 100644
--- a/python/.coveragerc
+++ b/python/.coveragerc
@@ -1,10 +1,8 @@
 # Configuration file for Python coverage tests
 [run]
 include = cugraph/cugraph/*
-          cugraph-pyg/cugraph_pyg/*
           cugraph-service/*
           pylibcugraph/pylibcugraph/*
 omit = cugraph/cugraph/tests/*
-       cugraph-pyg/cugraph_pyg/tests/*
        cugraph-service/tests/*
        pylibcugraph/pylibcugraph/tests/*
diff --git a/python/cugraph-dgl/LICENSE b/python/cugraph-dgl/LICENSE
deleted file mode 120000
index 30cff7403da..00000000000
--- a/python/cugraph-dgl/LICENSE
+++ /dev/null
@@ -1 +0,0 @@
-../../LICENSE
\ No newline at end of file
diff --git a/python/cugraph-dgl/README.md b/python/cugraph-dgl/README.md
deleted file mode 100644
index 013d4fe5e2e..00000000000
--- a/python/cugraph-dgl/README.md
+++ /dev/null
@@ -1,57 +0,0 @@
-# cugraph_dgl
-
-## Description
-
-[RAPIDS](https://rapids.ai) cugraph_dgl provides a duck-typed version of the [DGLGraph](https://docs.dgl.ai/api/python/dgl.DGLGraph.html#dgl.DGLGraph) class, which uses cugraph for storing graph structure and node/edge feature data.  Using cugraph as the backend allows DGL users to access a collection of GPU accelerated algorithms for graph analytics, such as centrality computation and community detection.
-
-## Conda
-
-Install and update cugraph-dgl and the required dependencies using the command:
-
-```shell
-# CUDA 11
-conda install -c rapidsai -c pytorch -c conda-forge -c nvidia -c dglteam/label/th23_cu118 cugraph-dgl
-
-# CUDA 12
-conda install -c rapidsai -c pytorch -c conda-forge -c nvidia -c dglteam/label/th23_cu121 cugraph-dgl
-```
-
-## Build from Source
-
-### Create the conda development environment
-```
-mamba env create -n cugraph_dgl_dev --file conda/cugraph_dgl_dev_11.6.yml
-```
-
-### Install  in editable mode
-```
-pip install -e .
-```
-
-### Run tests
-
-```
-pytest tests/*
-```
-
-
-## Usage
-```diff
-
-+from cugraph_dgl.convert import cugraph_storage_from_heterograph
-+cugraph_g = cugraph_storage_from_heterograph(dgl_g)
-
-sampler = dgl.dataloading.NeighborSampler(
-        [15, 10, 5], prefetch_node_feats=['feat'], prefetch_labels=['label'])
-
-train_dataloader = dgl.dataloading.DataLoader(
-- dgl_g,
-+ cugraph_g,
-train_idx,
-sampler,
-device=device,
-batch_size=1024,
-shuffle=True,
-drop_last=False,
-num_workers=0)
-```
diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml
deleted file mode 100644
index 70143b0d422..00000000000
--- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-# This file is generated by `rapids-dependency-file-generator`.
-# To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`.
-channels:
-- rapidsai
-- rapidsai-nightly
-- dask/label/dev
-- dglteam/label/th23_cu118
-- conda-forge
-- nvidia
-dependencies:
-- cugraph==25.2.*,>=0.0.0a0
-- dgl>=2.4.0.cu*
-- pandas
-- pre-commit
-- pylibcugraphops==25.2.*,>=0.0.0a0
-- pytest
-- pytest-benchmark
-- pytest-cov
-- pytest-xdist
-- pytorch-cuda==11.8
-- pytorch>=2.3,<2.4.0a0
-- scipy
-- tensordict>=0.1.2
-name: cugraph_dgl_dev_cuda-118
diff --git a/python/cugraph-dgl/cugraph_dgl/VERSION b/python/cugraph-dgl/cugraph_dgl/VERSION
deleted file mode 120000
index d62dc733efd..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/VERSION
+++ /dev/null
@@ -1 +0,0 @@
-../../../VERSION
\ No newline at end of file
diff --git a/python/cugraph-dgl/cugraph_dgl/__init__.py b/python/cugraph-dgl/cugraph_dgl/__init__.py
deleted file mode 100644
index 58850d47fba..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-# to prevent rapids context being created when importing cugraph_dgl
-os.environ["RAPIDS_NO_INITIALIZE"] = "1"
-from cugraph_dgl.graph import Graph
-from cugraph_dgl.cugraph_storage import CuGraphStorage
-from cugraph_dgl.convert import (
-    cugraph_storage_from_heterograph,
-    cugraph_dgl_graph_from_heterograph,
-)
-import cugraph_dgl.dataloading
-import cugraph_dgl.nn
-
-from cugraph_dgl._version import __git_commit__, __version__
diff --git a/python/cugraph-dgl/cugraph_dgl/_version.py b/python/cugraph-dgl/cugraph_dgl/_version.py
deleted file mode 100644
index e8adcc31430..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/_version.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import importlib.resources
-
-__version__ = (
-    importlib.resources.files(__package__).joinpath("VERSION").read_text().strip()
-)
-try:
-    __git_commit__ = (
-        importlib.resources.files(__package__)
-        .joinpath("GIT_COMMIT")
-        .read_text()
-        .strip()
-    )
-except FileNotFoundError:
-    __git_commit__ = ""
-
-__all__ = ["__git_commit__", "__version__"]
diff --git a/python/cugraph-dgl/cugraph_dgl/convert.py b/python/cugraph-dgl/cugraph_dgl/convert.py
deleted file mode 100644
index ae4b96dd391..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/convert.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-from cugraph.utilities.utils import import_optional
-
-import cugraph_dgl
-from cugraph_dgl import CuGraphStorage
-from cugraph_dgl.utils.cugraph_conversion_utils import (
-    get_edges_dict_from_dgl_HeteroGraph,
-    add_ndata_from_dgl_HeteroGraph,
-    add_edata_from_dgl_HeteroGraph,
-)
-
-dgl = import_optional("dgl")
-
-
-def cugraph_storage_from_heterograph(
-    g: dgl.DGLGraph, single_gpu: bool = True
-) -> CuGraphStorage:
-    """
-    Convert DGL Graph to CuGraphStorage graph
-    """
-    num_nodes_dict = {ntype: g.num_nodes(ntype) for ntype in g.ntypes}
-    edges_dict = get_edges_dict_from_dgl_HeteroGraph(g, single_gpu)
-    gs = CuGraphStorage(
-        data_dict=edges_dict,
-        num_nodes_dict=num_nodes_dict,
-        single_gpu=single_gpu,
-        idtype=g.idtype,
-    )
-    add_ndata_from_dgl_HeteroGraph(gs, g)
-    add_edata_from_dgl_HeteroGraph(gs, g)
-    return gs
-
-
-def cugraph_dgl_graph_from_heterograph(
-    input_graph: dgl.DGLGraph,
-    single_gpu: bool = True,
-    ndata_storage: str = "torch",
-    edata_storage: str = "torch",
-    **kwargs,
-) -> cugraph_dgl.Graph:
-    """
-    Converts a DGL Graph to a cuGraph-DGL Graph.
-    """
-
-    output_graph = cugraph_dgl.Graph(
-        is_multi_gpu=(not single_gpu),
-        ndata_storage=ndata_storage,
-        edata_storage=edata_storage,
-        **kwargs,
-    )
-
-    # Calling is_homogeneous does not work here
-    if len(input_graph.ntypes) <= 1:
-        output_graph.add_nodes(
-            input_graph.num_nodes(), data=input_graph.ndata, ntype=input_graph.ntypes[0]
-        )
-    else:
-        for ntype in input_graph.ntypes:
-            data = {
-                k: v_dict[ntype]
-                for k, v_dict in input_graph.ndata.items()
-                if ntype in v_dict
-            }
-            output_graph.add_nodes(input_graph.num_nodes(ntype), data=data, ntype=ntype)
-
-    if len(input_graph.canonical_etypes) <= 1:
-        can_etype = input_graph.canonical_etypes[0]
-        src_t, dst_t = input_graph.edges(form="uv", etype=can_etype)
-        output_graph.add_edges(src_t, dst_t, input_graph.edata, etype=can_etype)
-    else:
-        for can_etype in input_graph.canonical_etypes:
-            data = {
-                k: v_dict[can_etype]
-                for k, v_dict in input_graph.edata.items()
-                if can_etype in v_dict
-            }
-
-            src_t, dst_t = input_graph.edges(form="uv", etype=can_etype)
-            output_graph.add_edges(src_t, dst_t, data=data, etype=can_etype)
-
-    return output_graph
diff --git a/python/cugraph-dgl/cugraph_dgl/cugraph_storage.py b/python/cugraph-dgl/cugraph_dgl/cugraph_storage.py
deleted file mode 100644
index 6a1b6ee32b8..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/cugraph_storage.py
+++ /dev/null
@@ -1,714 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-from typing import Optional, Sequence, Tuple, Dict, Union
-from functools import cached_property
-from cugraph.utilities.utils import import_optional, MissingModule
-from cugraph.gnn import FeatureStore
-from cugraph.gnn.dgl_extensions.dgl_uniform_sampler import DGLUniformSampler
-import cudf
-import dask_cudf
-import cupy as cp
-from cugraph_dgl.utils.cugraph_storage_utils import (
-    _assert_valid_canonical_etype,
-    backend_dtype_to_np_dtype_dict,
-    add_edge_ids_to_edges_dict,
-    add_node_offset_to_edges_dict,
-)
-from cugraph_dgl.utils.feature_storage import dgl_FeatureStorage
-
-dgl = import_optional("dgl")
-F = import_optional("dgl.backend")
-torch = import_optional("torch")
-
-
-class CuGraphStorage:
-    """
-    Duck-typed version of the DGLHeteroGraph class made for cuGraph
-    for storing graph structure and node/edge feature data.
-
-    This object is wrapper around cugraph's Multi GPU MultiGraph and returns samples
-    that conform with `DGLHeteroGraph`
-    See: https://docs.rapids.ai/api/cugraph/nightly/api_docs/cugraph_dgl.html
-    """
-
-    def __init__(
-        self,
-        data_dict: Dict[
-            Tuple[str, str, str], Union[cudf.DataFrame, dask_cudf.DataFrame]
-        ],
-        num_nodes_dict: Dict[str, int],
-        single_gpu: bool = True,
-        device_id: int = 0,
-        idtype=None if isinstance(F, MissingModule) else F.int64,
-    ):
-        """
-        Constructor for creating a object of instance CuGraphStorage
-
-        See also ``cugraph_dgl.cugraph_storage_from_heterograph``
-        to convert from DGLHeteroGraph to CuGraphStorage
-
-        Parameters
-        ----------
-         data_dict:
-            The dictionary data for constructing a heterogeneous graph.
-            The keys are in the form of string triplets (src_type, edge_type, dst_type),
-            specifying the source node, edge, and destination node types.
-            The values are graph data is a dataframe with 2 columns form of (𝑈,𝑉),
-            where (𝑈[𝑖],𝑉[𝑖]) forms the edge with ID 𝑖.
-
-         num_nodes_dict: dict[str, int]
-            The number of nodes for some node types, which is a
-            dictionary mapping a node type T to the number of T-typed nodes.
-
-        single_gpu: bool
-            Whether to create the cugraph Property Graph
-            on a single GPU or multiple GPUs
-            single GPU = True
-            single GPU = False
-
-        device_id: int
-            If specified, must be the integer ID of the GPU device to have the
-            results being created on
-
-        idtype: Framework-specific device object,
-            The data type for storing the structure-related graph
-            information this can be ``torch.int32`` or ``torch.int64``
-            for PyTorch.
-            Defaults to ``torch.int64`` if pytorch is installed
-
-
-         Examples
-         --------
-         The following example uses `CuGraphStorage` :
-            >>> from cugraph_dgl.cugraph_storage import CuGraphStorage
-            >>> import cudf
-            >>> import torch
-            >>> num_nodes_dict={"drug": 3, "gene": 2, "disease": 1}
-            >>> drug_interacts_drug_df = cudf.DataFrame({"src": [0, 1], "dst": [1, 2]})
-            >>> drug_interacts_gene = cudf.DataFrame({"src": [0, 1], "dst": [0, 1]})
-            >>> drug_treats_disease = cudf.DataFrame({"src": [1], "dst": [0]})
-            >>> data_dict = {("drug", "interacts", "drug"):drug_interacts_drug_df,
-                 ("drug", "interacts", "gene"):drug_interacts_gene,
-                 ("drug", "treats", "disease"):drug_treats_disease }
-            >>> gs = CuGraphStorage(data_dict=data_dict, num_nodes_dict=num_nodes_dict)
-            >>> gs.add_node_data(ntype='drug', feat_name='node_feat',
-                                          feat_obj=torch.as_tensor([0.1, 0.2, 0.3]))
-            >>> gs.add_edge_data(canonical_etype=("drug", "interacts", "drug"),
-                                          feat_name='edge_feat',
-                                          feat_obj=torch.as_tensor([0.2, 0.4]))
-            >>> gs.ntypes
-            ['disease', 'drug', 'gene']
-            >>> gs.etypes
-            ['interacts', 'interacts', 'treats']
-            >>> gs.canonical_etypes
-            [('drug', 'interacts', 'drug'),
-             ('drug', 'interacts', 'gene'),
-             ('drug', 'treats', 'disease')]
-
-            >>> gs.sample_neighbors({'disease':[0]},
-                                    1)
-            Graph(num_nodes={'disease': 1, 'drug': 3, 'gene': 2},
-            num_edges={('drug', 'interacts', 'drug'): 0,
-                       ('drug', 'interacts', 'gene'): 0,
-                       ('drug', 'treats', 'disease'): 1},
-            metagraph=[('drug', 'drug', 'interacts'),
-                       ('drug', 'gene', 'interacts'),
-                       ('drug', 'disease', 'treats')])
-
-            >>> gs.get_node_storage(key='node_feat',
-                                    ntype='drug').fetch([0,1,2])
-            tensor([0.1000, 0.2000, 0.3000], device='cuda:0',
-             dtype=torch.float64)
-
-            >>> es = gs.get_edge_storage(key='edge_feat',
-                                    etype=('drug', 'interacts', 'drug'))
-            >>> es.fetch([0,1])
-            tensor([0.2000, 0.4000], device='cuda:0', dtype=torch.float64)
-        """
-        # Order is very important
-        # do this first before cuda work
-        # Create cuda context on the right gpu,
-        # defaults to gpu-0
-        import numba.cuda as cuda
-
-        cuda.select_device(device_id)
-
-        self.idtype = idtype
-        self.id_np_type = backend_dtype_to_np_dtype_dict[idtype]
-        self.num_nodes_dict = num_nodes_dict
-        self._ntype_offset_d = self.__get_ntype_offset_d(self.num_nodes_dict)
-        # Todo: Can possibly optimize by persisting edge-list
-        # Trade-off memory for run-time
-        self.num_edges_dict = {k: len(v) for k, v in data_dict.items()}
-        self._etype_offset_d = self.__get_etype_offset_d(self.num_edges_dict)
-        self.single_gpu = single_gpu
-
-        self.ndata_storage = FeatureStore(backend="torch")
-        self.ndata = self.ndata_storage.fd
-        self.edata_storage = FeatureStore(backend="torch")
-        self.edata = self.edata_storage.fd
-
-        self._etype_range_d = self.__get_etype_range_d(
-            self._etype_offset_d, self.num_canonical_edges_dict
-        )
-        _edges_dict = add_edge_ids_to_edges_dict(
-            data_dict, self._etype_offset_d, self.id_np_type
-        )
-
-        self._edges_dict = add_node_offset_to_edges_dict(
-            _edges_dict, self._ntype_offset_d
-        )
-
-        # Persist the dataframes so they can be retrieved later
-        # for a multi-GPU workflow.
-        if not single_gpu:
-            for k in list(self._edges_dict.keys()):
-                self._edges_dict[k] = self._edges_dict[k].persist()
-
-        self._etype_id_dict = {
-            etype: etype_id for etype_id, etype in enumerate(self.canonical_etypes)
-        }
-        self.uniform_sampler = None
-
-    def add_node_data(self, feat_obj: Sequence, ntype: str, feat_name: str):
-        """
-        Add node features
-
-        Parameters
-        ----------
-        df : array_like object
-            The node feature to save in feature store
-        ntype : str
-            The node type to be added.
-            For example, if dataframe contains data about users, ntype
-            might be "users".
-        feat_name : str
-            The name of the feature being stored
-        Returns
-        -------
-        None
-        """
-        self.ndata_storage.add_data(
-            feat_obj=feat_obj,
-            type_name=ntype,
-            feat_name=feat_name,
-        )
-
-    def add_edge_data(
-        self,
-        feat_obj: Sequence,
-        canonical_etype: Tuple[str, str, str],
-        feat_name: str,
-    ):
-        """
-        Add edge features
-
-        Parameters
-        ----------
-        feat_obj : array_like object
-            The edge feature to save in feature store
-        canonical_etype : Tuple[(str, str, str)]
-            The edge type to be added
-        feat_name : string
-        Returns
-        -------
-        None
-        """
-        _assert_valid_canonical_etype(canonical_etype)
-        self.edata_storage.add_data(
-            feat_obj=feat_obj,
-            type_name=canonical_etype,
-            feat_name=feat_name,
-        )
-
-    # Sampling Function
-    def sample_neighbors(
-        self,
-        nodes,
-        fanout: int,
-        edge_dir: str = "in",
-        prob: Optional[str] = None,
-        exclude_edges=None,
-        replace: bool = False,
-        output_device=None,
-    ):
-        """
-        Return a DGLGraph which is a subgraph induced by sampling neighboring
-        edges of the given nodes.
-        See ``dgl.sampling.sample_neighbors`` for detailed semantics.
-        Parameters
-        ----------
-        nodes : Tensor or dict[str, Tensor]
-            Node IDs to sample neighbors from.
-            This argument can take a single ID tensor or a dictionary of node
-            types and ID tensors. If a single tensor is given, the graph must
-            only have one type of nodes.
-        fanout : int or dict[etype, int]
-            The number of edges to be sampled for each node on each edge type.
-            This argument can take a single int or a dictionary of edge types
-            and ints. If a single int is given, DGL will sample this number of
-            edges for each node for every edge type.
-            If -1 is given for a single edge type, all the neighboring edges
-            with that edge type will be selected.
-        edge_dir: 'in' or 'out'
-            The direction of edges to import
-        prob : str, optional
-            Feature name used as the (un-normalized) probabilities associated
-            with each neighboring edge of a node.  The feature must have only
-            one element for each edge.
-            The features must be non-negative floats, and the sum of the
-            features of inbound/outbound edges for every node must be positive
-            (though they don't have to sum up to one).  Otherwise, the result
-            will be undefined. If :attr:`prob` is not None, GPU sampling is
-            not supported.
-        exclude_edges: tensor or dict
-            Edge IDs to exclude during sampling neighbors for the seed nodes.
-            This argument can take a single ID tensor or a dictionary of edge
-            types and ID tensors. If a single tensor is given, the graph must
-            only have one type of nodes.
-        replace : bool, optional
-            If True, sample with replacement.
-        output_device : Framework-specific device context object, optional
-            The output device.  Default is the same as the input graph.
-        Returns
-        -------
-        DGLGraph
-            A sampled subgraph with the same nodes as the original graph, but
-            only the sampled neighboring edges.  The induced edge IDs will be
-            in ``edata[dgl.EID]``.
-        """
-        if self.uniform_sampler is None:
-            self.uniform_sampler = DGLUniformSampler(
-                self._edges_dict,
-                self._etype_range_d,
-                self._etype_id_dict,
-                self.single_gpu,
-            )
-
-        if prob is not None:
-            raise NotImplementedError(
-                "prob is not currently supported",
-                " for sample_neighbors in CuGraphStorage",
-            )
-
-        if exclude_edges is not None:
-            raise NotImplementedError(
-                "exclude_edges is not currently supported",
-                " for sample_neighbors in CuGraphStorage",
-            )
-
-        if not isinstance(nodes, dict):
-            if len(self.ntypes) > 1:
-                raise dgl.DGLError(
-                    "Must specify node type when the graph is not homogeneous."
-                )
-            nodes = cp.asarray(nodes)
-            nodes = {self.ntypes[0]: nodes}
-        else:
-            nodes = {
-                k: self.dgl_n_id_to_cugraph_id(F.tensor(n), k) for k, n in nodes.items()
-            }
-            nodes = {k: cp.asarray(F.tensor(n)) for k, n in nodes.items()}
-
-        sampled_obj = self.uniform_sampler.sample_neighbors(
-            nodes,
-            fanout,
-            edge_dir=edge_dir,
-            prob=prob,
-            replace=replace,
-        )
-        # heterograph case
-        if len(self.etypes) > 1:
-            graph_data_d, graph_eid_d = self.__convert_to_dgl_tensor_d(
-                sampled_obj, self.idtype
-            )
-            sampled_graph = dgl.heterograph(
-                data_dict=graph_data_d,
-                num_nodes_dict=self.num_nodes_dict,
-                idtype=self.idtype,
-            )
-            sampled_graph.edata[dgl.EID] = graph_eid_d
-        else:
-            src_ids, dst_ids, edge_ids = sampled_obj
-            src_ids = torch.as_tensor(src_ids, device="cuda")
-            dst_ids = torch.as_tensor(dst_ids, device="cuda")
-            edge_ids = torch.as_tensor(edge_ids, device="cuda")
-            total_number_of_nodes = self.total_number_of_nodes
-            sampled_graph = dgl.graph(
-                (src_ids, dst_ids),
-                num_nodes=total_number_of_nodes,
-                idtype=self.idtype,
-            )
-            sampled_graph.edata[dgl.EID] = edge_ids
-
-        # to device function move the dgl graph to desired devices
-        if output_device is not None:
-            sampled_graph.to(output_device)
-        return sampled_graph
-
-    # Required in Cluster-GCN
-    def subgraph(self, nodes, relabel_nodes=False, output_device=None):
-        """Return a subgraph induced on given nodes.
-        This has the same semantics as ``dgl.node_subgraph``.
-        Parameters
-        ----------
-        nodes : nodes or dict[str, nodes]
-            The nodes to form the subgraph. The allowed nodes formats are:
-            * Int Tensor: Each element is a node ID. The tensor must have the
-             same device type and ID data type as the graph's.
-            * iterable[int]: Each element is a node ID.
-            * Bool Tensor: Each :math:`i^{th}` element is a bool flag
-             indicating whether node :math:`i` is in the subgraph.
-             If the graph is homogeneous, directly pass the above formats.
-             Otherwise, the argument must be a dictionary with keys being
-             node types and values being the node IDs in the above formats.
-        relabel_nodes : bool, optional
-            If True, the extracted subgraph will only have the nodes in the
-            specified node set and it will relabel the nodes in order.
-        output_device : Framework-specific device context object, optional
-            The output device.  Default is the same as the input graph.
-        Returns
-        -------
-        DGLGraph
-            The subgraph.
-        """
-        raise NotImplementedError("subgraph is not implemented yet")
-
-    # Required in Link Prediction
-    # relabel = F we use dgl functions,
-    # relabel = T, we need to delete nodes and relabel
-    def edge_subgraph(self, edges, relabel_nodes=False, output_device=None):
-        """
-        Return a subgraph induced on given edges.
-        This has the same semantics as ``dgl.edge_subgraph``.
-        Parameters
-        ----------
-        edges : edges or dict[(str, str, str), edges]
-            The edges to form the subgraph. The allowed edges formats are:
-            * Int Tensor: Each element is an edge ID. The tensor must have the
-              same device type and ID data type as the graph's.
-            * iterable[int]: Each element is an edge ID.
-            * Bool Tensor: Each :math:`i^{th}` element is a bool flag
-             indicating whether edge :math:`i` is in the subgraph.
-            If the graph is homogeneous, one can directly pass the above
-            formats. Otherwise, the argument must be a dictionary with keys
-            being edge types and values being the edge IDs in the above formats
-        relabel_nodes : bool, optional
-            If True, the extracted subgraph will only have the nodes in the
-            specified node set and it will relabel the nodes in order.
-        output_device : Framework-specific device context object, optional
-            The output device.  Default is the same as the input graph.
-        Returns
-        -------
-        DGLGraph
-            The subgraph.
-        """
-        raise NotImplementedError("edge_subgraph is not implemented yet")
-
-    # Required in Link Prediction negative sampler
-    def find_edges(
-        self, eid, etype: Optional[Tuple[str, str, str]] = None, output_device=None
-    ):
-        """
-        Return the source and destination node ID(s) given the edge ID(s).
-
-        Parameters
-        ----------
-        eid : edge ID(s)
-            The edge IDs. The allowed formats are:
-
-            * ``int``: A single ID.
-            * Int Tensor: Each element is an ID.
-            The tensor must have the same device type
-            and ID data type as the graph's.
-            * iterable[int]: Each element is an ID.
-
-        etype : Tuple[str, str, str]
-            The type name of the edges.
-            Can be omitted if the graph has only one type of edges.
-
-        Returns
-        -------
-        Tensor
-            The source node IDs of the edges.
-            The i-th element is the source node ID of the i-th edge.
-        Tensor
-            The destination node IDs of the edges.
-            The i-th element is the destination node ID of the i-th edge.
-        """
-
-        if etype:
-            src_type, connection_type, dst_type = etype
-        eid = self.dgl_e_id_to_cugraph_id(eid, etype)
-        # TODO: implement below
-        src, dst = self.find_edges(eid, etype)
-        src = torch.as_tensor(src, device="cuda")
-        dst = torch.as_tensor(dst, device="cuda")
-        src = self.cugraph_n_id_to_dgl_id(src, src_type)
-        dst = self.cugraph_n_id_to_dgl_id(dst, dst_type)
-
-        return src, dst
-
-    # Required in Link Prediction negative sampler
-    def global_uniform_negative_sampling(
-        self, num_samples, exclude_self_loops=True, replace=False, etype=None
-    ):
-        """
-        Per source negative sampling as in ``dgl.dataloading.GlobalUniform``
-        """
-        raise NotImplementedError(
-            "global_uniform_negative_sampling not implemented yet"
-        )
-
-    def get_node_storage(self, key: str, ntype: str = None):
-        """
-        Get storage object of node feature of
-        type :attr:`ntype` and name :attr:`key`
-        """
-        if ntype is None:
-            if len(self.ntypes) > 1:
-                raise ValueError(
-                    "ntype must be provided if multiple ntypes are present in the graph"
-                )
-            else:
-                ntype = self.ntype[0]
-        return dgl_FeatureStorage(self.ndata_storage, type_name=ntype, feat_name=key)
-
-    def get_edge_storage(self, key: str, etype: Optional[Tuple[str, str, str]] = None):
-        """
-        Get storage object of edge feature of
-        type :attr:`ntype` and name :attr:`key`
-        """
-        if etype is None:
-            if len(self.etypes) > 1:
-                raise ValueError(
-                    "etype must be provided if multiple etypes are present in the graph"
-                )
-            else:
-                etype = self.etypes[0]
-        return dgl_FeatureStorage(self.edata_storage, type_name=etype, feat_name=key)
-
-    # Number of edges/nodes utils
-    def num_nodes(self, ntype: str = None) -> int:
-        """
-        Return the number of nodes in the graph.
-        Parameters
-        ----------
-        ntype : str, optional
-            The node type name. If given, it returns the number of nodes of the
-            type.
-            If not given (default), it  returns the total number of nodes
-            of all types.
-
-        Returns
-        -------
-        int
-            The number of nodes.
-        """
-        if ntype:
-            return self.num_nodes_dict[ntype]
-        else:
-            return self.total_number_of_nodes
-
-    def number_of_nodes(self, ntype: str = None) -> int:
-        """
-        Return the number of nodes in the graph.
-        Alias of ``num_nodes``
-        Parameters
-        ----------
-        ntype : str, optional
-            The node type name. If given, it returns the number of nodes of the
-            type.
-            If not given (default), it  returns the total number of nodes
-            of all types.
-
-        Returns
-        -------
-        int
-            The number of nodes.
-        """
-        return self.num_nodes(ntype)
-
-    @property
-    def ntypes(self) -> Sequence[str]:
-        """
-        Return all the node type names in the graph.
-
-        Returns
-        -------
-        list[str]
-            All the node type names in a list.
-        """
-        ntypes = list(self.num_nodes_dict.keys())
-        return ntypes
-
-    @property
-    def etypes(self) -> Sequence[str]:
-        """
-        Return all the edge type names in the graph.
-
-        Returns
-        -------
-        list[str]
-            All the edge type names in a list.
-        """
-
-        return [can_etype[1] for can_etype in self.canonical_etypes]
-
-    def num_edges(self, etype: Optional[str] = None) -> int:
-        """
-        Return the number of edges in the graph.
-        Parameters
-        ----------
-        etype:
-
-        Returns
-        -------
-        int
-            The number of edges
-        """
-        if etype:
-            if etype not in self.canonical_etypes:
-                etype = self.get_corresponding_canonical_etype(etype)
-            return self.num_edges_dict[etype]
-        else:
-            return self.total_number_of_edges
-
-    @cached_property
-    def total_number_of_edges(self) -> int:
-        return sum(self.num_edges_dict.values())
-
-    @cached_property
-    def total_number_of_nodes(self) -> int:
-        return sum(self.num_nodes_dict.values())
-
-    @property
-    def num_canonical_edges_dict(self) -> dict[str, int]:
-        return self.num_edges_dict
-
-    @property
-    def canonical_etypes(self) -> Sequence[Tuple[str, str, str]]:
-        return list(self.num_edges_dict.keys())
-
-    @property
-    def device(self):
-        """
-        Get the device of the graph.
-        Returns
-        -------
-        device context
-            The device of the graph, which should be a
-            framework-specific device object (e.g., ``torch.device``).
-        """
-        return torch.cuda.current_device()
-
-    # Index Conversion Utils
-    def get_node_id_offset(self, ntype: str) -> int:
-        """
-        Return the integer offset for node id of type ntype
-        """
-        return self._ntype_offset_d[ntype]
-
-    def get_edge_id_offset(self, canonical_etype: Tuple[str, str, str]) -> int:
-        """
-        Return the integer offset for node id of type etype
-        """
-        _assert_valid_canonical_etype(canonical_etype)
-        return self._etype_offset_d[canonical_etype]
-
-    def dgl_n_id_to_cugraph_id(self, index_t, ntype: str):
-        return index_t + self.get_node_id_offset(ntype)
-
-    def cugraph_n_id_to_dgl_id(self, index_t, ntype: str):
-        return index_t - self.get_node_id_offset(ntype)
-
-    def dgl_e_id_to_cugraph_id(self, index_t, canonical_etype: Tuple[str, str, str]):
-        return index_t + self.get_edge_id_offset(canonical_etype)
-
-    def cugraph_e_id_to_dgl_id(self, index_t, canonical_etype: Tuple[str, str, str]):
-        return index_t - self.get_edge_id_offset(canonical_etype)
-
-    # Methods for getting the offsets per type
-    @staticmethod
-    def __get_etype_offset_d(num_canonical_edges_dict):
-        last_st = 0
-        etype_st_d = {}
-        for etype in sorted(num_canonical_edges_dict.keys()):
-            etype_st_d[etype] = last_st
-            last_st = last_st + num_canonical_edges_dict[etype]
-        return etype_st_d
-
-    @staticmethod
-    def __get_etype_range_d(etype_offset_d, num_canonical_edges_dict):
-        # dict for edge_id_offset_start
-        etype_range_d = {}
-        for etype, st in etype_offset_d.items():
-            etype_range_d[etype] = (st, st + num_canonical_edges_dict[etype])
-        return etype_range_d
-
-    @staticmethod
-    def __get_ntype_offset_d(num_nodes_dict):
-        # dict for node_id_offset_start
-        last_st = 0
-        ntype_st_d = {}
-        for ntype in sorted(num_nodes_dict.keys()):
-            ntype_st_d[ntype] = last_st
-            last_st = last_st + num_nodes_dict[ntype]
-        return ntype_st_d
-
-    def get_corresponding_canonical_etype(self, etype: str) -> str:
-        can_etypes = [
-            can_etype for can_etype in self.canonical_etypes if can_etype[1] == etype
-        ]
-        if len(can_etypes) > 1:
-            raise dgl.DGLError(
-                f'Edge type "{etype}" is ambiguous. Please use canonical'
-                + "edge type in the form of (srctype, etype, dsttype)"
-            )
-        return can_etypes[0]
-
-    def __convert_to_dgl_tensor_d(
-        self,
-        graph_sampled_data_d,
-        o_dtype=None if isinstance(F, MissingModule) else F.int64,
-    ):
-
-        graph_data_d = {}
-        graph_eid_d = {}
-        for canonical_etype, (
-            src,
-            dst,
-            edge_id,
-        ) in graph_sampled_data_d.items():
-            src_type = canonical_etype[0]
-            dst_type = canonical_etype[2]
-
-            src_t = _torch_tensor_from_cp_array(src)
-            dst_t = _torch_tensor_from_cp_array(dst)
-            edge_id_t = _torch_tensor_from_cp_array(edge_id)
-
-            src_t = self.cugraph_n_id_to_dgl_id(src_t, src_type)
-            dst_t = self.cugraph_n_id_to_dgl_id(dst_t, dst_type)
-            edge_id_t = self.cugraph_e_id_to_dgl_id(edge_id_t, canonical_etype)
-            graph_data_d[canonical_etype] = (src_t.to(o_dtype), dst_t.to(o_dtype))
-            graph_eid_d[canonical_etype] = edge_id_t.to(o_dtype)
-
-        return graph_data_d, graph_eid_d
-
-
-def _torch_tensor_from_cp_array(ar):
-    if len(ar) == 0:
-        return torch.as_tensor(ar.get()).to("cuda")
-    return torch.as_tensor(ar, device="cuda")
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py b/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py
deleted file mode 100644
index 8a2e9cd954d..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-from cugraph_dgl.dataloading.dataset import (
-    HomogenousBulkSamplerDataset,
-    HeterogenousBulkSamplerDataset,
-)
-
-from cugraph_dgl.dataloading.sampler import Sampler
-from cugraph_dgl.dataloading.neighbor_sampler import NeighborSampler
-
-from cugraph_dgl.dataloading.dask_dataloader import DaskDataLoader
-from cugraph_dgl.dataloading.dataloader import DataLoader as FutureDataLoader
-
-
-def DataLoader(*args, **kwargs):
-    warnings.warn(
-        "DataLoader has been renamed to DaskDataLoader.  "
-        "In Release 24.10, cugraph_dgl.dataloading.FutureDataLoader "
-        "will take over the DataLoader name.",
-        FutureWarning,
-    )
-    return DaskDataLoader(*args, **kwargs)
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py
deleted file mode 100644
index e220b93f738..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py
+++ /dev/null
@@ -1,321 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-import os
-import shutil
-import cugraph_dgl
-import cupy as cp
-import cudf
-from cugraph.utilities.utils import import_optional
-from cugraph.gnn import BulkSampler
-from dask.distributed import default_client, Event
-from cugraph_dgl.dataloading import (
-    HomogenousBulkSamplerDataset,
-    HeterogenousBulkSamplerDataset,
-)
-from cugraph_dgl.dataloading.utils.extract_graph_helpers import (
-    create_cugraph_graph_from_edges_dict,
-)
-
-dgl = import_optional("dgl")
-torch = import_optional("torch")
-
-
-class DaskDataLoader(torch.utils.data.DataLoader):
-    """
-    Sampled graph data loader. Wrap a :class:`~cugraph_dgl.CuGraphStorage` and a
-    :class:`~cugraph_dgl.dataloading.NeighborSampler` into
-    an iterable over mini-batches of samples. cugraph_dgl's ``DataLoader`` extends
-    PyTorch's ``DataLoader`` by handling creation and
-    transmission of graph samples.
-    """
-
-    def __init__(
-        self,
-        graph: cugraph_dgl.CuGraphStorage,
-        indices: torch.Tensor,
-        graph_sampler: cugraph_dgl.dataloading.NeighborSampler,
-        sampling_output_dir: str,
-        batches_per_partition: int = 50,
-        seeds_per_call: int = 200_000,
-        device: torch.device = None,
-        use_ddp: bool = False,
-        ddp_seed: int = 0,
-        batch_size: int = 1024,
-        drop_last: bool = False,
-        shuffle: bool = False,
-        sparse_format: str = "coo",
-        **kwargs,
-    ):
-        """
-        Constructor for DaskDataLoader:
-        -------------------------------
-        graph : CuGraphStorage
-            The graph.
-        indices : Tensor or dict[ntype, Tensor]
-            The set of indices.  It can either be a tensor of
-            integer indices or a dictionary of types and indices.
-            The actual meaning of the indices is defined by the :meth:`sample` method of
-            :attr:`graph_sampler`.
-        graph_sampler : cugraph_dgl.dataloading.NeighborSampler
-            The subgraph sampler.
-        sampling_output_dir: str
-            Output directory to share sampling results in
-        batches_per_partition: int
-            The number of batches of sampling results to write/read
-        seeds_per_call: int
-            The number of seeds to sample at once
-        device : device context, optional
-            The device of the generated MFGs in each iteration, which should be a
-            PyTorch device object (e.g., ``torch.device``).
-            By default this returns the tenors on device with the current
-            cuda context
-        use_ddp : boolean, optional
-            If True, tells the DataLoader to split the training set for each
-            participating process appropriately using
-            :class:`torch.utils.data.distributed.DistributedSampler`.
-            Overrides the :attr:`sampler` argument of
-            :class:`torch.utils.data.DataLoader`.
-        ddp_seed : int, optional
-            The seed for shuffling the dataset in
-            :class:`torch.utils.data.distributed.DistributedSampler`.
-            Only effective when :attr:`use_ddp` is True.
-        batch_size: int
-            Batch size.
-        sparse_format: str, default = "coo"
-            The sparse format of the emitted sampled graphs. Choose between "csc"
-            and "coo". When using "csc", the graphs are of type
-            cugraph_dgl.nn.SparseGraph.
-        kwargs : dict
-            Key-word arguments to be passed to the parent PyTorch
-            :py:class:`torch.utils.data.DataLoader` class. Common arguments are:
-                - ``batch_size`` (int): The number of indices in each batch.
-                - ``drop_last`` (bool): Whether to drop the last incomplete
-                                        batch.
-                - ``shuffle`` (bool): Whether to randomly shuffle the
-                                      indices at each epoch
-        Examples
-        --------
-        To train a 3-layer GNN for node classification on a set of nodes
-        ``train_nid`` on a homogeneous graph where each node takes messages
-        from 15 neighbors on the first layer, 10 neighbors on the second, and
-        5 neighbors on the third:
-        >>> sampler = cugraph_dgl.dataloading.NeighborSampler([15, 10, 5])
-        >>> dataloader = cugraph_dgl.dataloading.DataLoader(
-        ...     g, train_nid, sampler,
-        ...     batch_size=1024, shuffle=True, drop_last=False, num_workers=0)
-        >>> for input_nodes, output_nodes, blocks in dataloader:
-        ...     train_on(input_nodes, output_nodes, blocks)
-        **Using with Distributed Data Parallel**
-        If you are using PyTorch's distributed training (e.g. when using
-        :mod:`torch.nn.parallel.DistributedDataParallel`),
-        you can train the model by turning
-        on the `use_ddp` option:
-        >>> sampler = cugraph_dgl.dataloading.NeighborSampler([15, 10, 5])
-        >>> dataloader = cugraph_dgl.dataloading.DataLoader(
-        ...     g, train_nid, sampler, use_ddp=True,
-        ...     batch_size=1024, shuffle=True, drop_last=False, num_workers=0)
-        >>> for epoch in range(start_epoch, n_epochs):
-        ...     for input_nodes, output_nodes, blocks in dataloader:
-        ...
-        """
-        if sparse_format not in ["coo", "csc"]:
-            raise ValueError(
-                f"sparse_format must be one of 'coo', 'csc', "
-                f"but got {sparse_format}."
-            )
-        self.sparse_format = sparse_format
-
-        self.ddp_seed = ddp_seed
-        self.use_ddp = use_ddp
-        self.shuffle = shuffle
-        self.drop_last = drop_last
-        self.graph_sampler = graph_sampler
-        worker_init_fn = dgl.dataloading.WorkerInitWrapper(
-            kwargs.get("worker_init_fn", None)
-        )
-        self.other_storages = {}
-        self.epoch_number = 0
-        self._batch_size = batch_size
-        self._sampling_output_dir = sampling_output_dir
-        self._batches_per_partition = batches_per_partition
-        self._seeds_per_call = seeds_per_call
-        self._rank = None
-
-        indices = _dgl_idx_to_cugraph_idx(indices, graph)
-
-        self.tensorized_indices_ds = dgl.dataloading.create_tensorized_dataset(
-            indices,
-            batch_size,
-            drop_last,
-            use_ddp,
-            ddp_seed,
-            shuffle,
-            kwargs.get("persistent_workers", False),
-        )
-
-        if len(graph.ntypes) <= 1:
-            self.cugraph_dgl_dataset = HomogenousBulkSamplerDataset(
-                total_number_of_nodes=graph.total_number_of_nodes,
-                edge_dir=self.graph_sampler.edge_dir,
-                sparse_format=sparse_format,
-            )
-        else:
-            etype_id_to_etype_str_dict = {v: k for k, v in graph._etype_id_dict.items()}
-
-            self.cugraph_dgl_dataset = HeterogenousBulkSamplerDataset(
-                num_nodes_dict=graph.num_nodes_dict,
-                etype_id_dict=etype_id_to_etype_str_dict,
-                etype_offset_dict=graph._etype_offset_d,
-                ntype_offset_dict=graph._ntype_offset_d,
-                edge_dir=self.graph_sampler.edge_dir,
-            )
-
-        if use_ddp:
-            rank = torch.distributed.get_rank()
-            client = default_client()
-            self._graph_creation_event = Event("cugraph_dgl_load_mg_graph_event")
-            if rank == 0:
-                G = create_cugraph_graph_from_edges_dict(
-                    edges_dict=graph._edges_dict,
-                    etype_id_dict=graph._etype_id_dict,
-                    edge_dir=graph_sampler.edge_dir,
-                )
-                client.publish_dataset(cugraph_dgl_mg_graph_ds=G)
-                self._graph_creation_event.set()
-            else:
-                if self._graph_creation_event.wait(timeout=1000):
-                    G = client.get_dataset("cugraph_dgl_mg_graph_ds")
-                else:
-                    raise RuntimeError(
-                        f"Fetch cugraph_dgl_mg_graph_ds to worker_id {rank}",
-                        "from worker_id 0 failed",
-                    )
-        else:
-            rank = 0
-            G = create_cugraph_graph_from_edges_dict(
-                edges_dict=graph._edges_dict,
-                etype_id_dict=graph._etype_id_dict,
-                edge_dir=graph_sampler.edge_dir,
-            )
-
-        self._rank = rank
-        self._cugraph_graph = G
-        super().__init__(
-            self.cugraph_dgl_dataset,
-            batch_size=None,
-            worker_init_fn=worker_init_fn,
-            collate_fn=lambda x: x,  # Hack to prevent collating
-            **kwargs,
-        )
-
-    def __iter__(self):
-        output_dir = os.path.join(
-            self._sampling_output_dir, "epoch_" + str(self.epoch_number)
-        )
-        kwargs = {}
-        if isinstance(self.cugraph_dgl_dataset, HomogenousBulkSamplerDataset):
-            kwargs["deduplicate_sources"] = True
-            kwargs["prior_sources_behavior"] = "carryover"
-            kwargs["renumber"] = True
-
-            if self.sparse_format == "csc":
-                kwargs["compression"] = "CSR"
-                kwargs["compress_per_hop"] = True
-                # The following kwargs will be deprecated in uniform sampler.
-                kwargs["use_legacy_names"] = False
-                kwargs["include_hop_column"] = False
-
-        else:
-            kwargs["deduplicate_sources"] = False
-            kwargs["prior_sources_behavior"] = None
-            kwargs["renumber"] = False
-
-        bs = BulkSampler(
-            output_path=output_dir,
-            batch_size=self._batch_size,
-            graph=self._cugraph_graph,
-            batches_per_partition=self._batches_per_partition,
-            seeds_per_call=self._seeds_per_call,
-            fanout_vals=self.graph_sampler._reversed_fanout_vals,
-            with_replacement=self.graph_sampler.replace,
-            **kwargs,
-        )
-
-        if self.shuffle:
-            self.tensorized_indices_ds.shuffle()
-
-        batch_df = create_batch_df(self.tensorized_indices_ds)
-        bs.add_batches(batch_df, start_col_name="start", batch_col_name="batch_id")
-        bs.flush()
-        self.cugraph_dgl_dataset.set_input_files(input_directory=output_dir)
-        self.epoch_number = self.epoch_number + 1
-        return super().__iter__()
-
-    def __del__(self):
-        if self.use_ddp:
-            torch.distributed.barrier()
-        if self._rank == 0:
-            if self.use_ddp:
-                client = default_client()
-                client.unpublish_dataset("cugraph_dgl_mg_graph_ds")
-                self._graph_creation_event.clear()
-            _clean_directory(self._sampling_output_dir)
-
-
-def get_batch_id_series(n_output_rows: int, batch_size: int) -> cudf.Series:
-    num_batches = (n_output_rows + batch_size - 1) // batch_size
-    print(f"Number of batches = {num_batches}".format(num_batches))
-    batch_ar = cp.arange(0, num_batches).repeat(batch_size)
-    batch_ar = batch_ar[0:n_output_rows].astype(cp.int32)
-    return cudf.Series(batch_ar)
-
-
-def create_batch_df(dataset: torch.Tensor) -> cudf.DataFrame:
-    batch_id_ls = []
-    indices_ls = []
-    for batch_id, b_indices in enumerate(dataset):
-        if isinstance(b_indices, dict):
-            b_indices = torch.cat(list(b_indices.values()))
-        batch_id_ar = cp.full(shape=len(b_indices), fill_value=batch_id, dtype=cp.int32)
-        batch_id_ls.append(batch_id_ar)
-        indices_ls.append(b_indices)
-
-    batch_id_ar = cp.concatenate(batch_id_ls)
-    indices_ar = cp.asarray(torch.concat(indices_ls))
-    batches_df = cudf.DataFrame(
-        {
-            "start": indices_ar,
-            "batch_id": batch_id_ar,
-        }
-    )
-    return batches_df
-
-
-def _dgl_idx_to_cugraph_idx(idx, cugraph_gs):
-    if not isinstance(idx, dict):
-        if len(cugraph_gs.ntypes) > 1:
-            raise dgl.DGLError(
-                "Must specify node type when the graph is not homogeneous."
-            )
-        return idx
-    else:
-        return {k: cugraph_gs.dgl_n_id_to_cugraph_id(n, k) for k, n in idx.items()}
-
-
-def _clean_directory(path):
-    """param <path> could either be relative or absolute."""
-    if os.path.isfile(path):
-        os.remove(path)  # remove the file
-    elif os.path.isdir(path):
-        shutil.rmtree(path)  # remove dir and all contains
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
deleted file mode 100644
index 4f36353cb18..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
+++ /dev/null
@@ -1,162 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-from typing import Union, Optional, Dict
-
-from cugraph.utilities.utils import import_optional
-
-import cugraph_dgl
-from cugraph_dgl.typing import TensorType
-from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor
-
-dgl = import_optional("dgl")
-torch = import_optional("torch")
-
-
-class DataLoader:
-    """
-    Duck-typed version of dgl.dataloading.DataLoader
-    """
-
-    def __init__(
-        self,
-        graph: "cugraph_dgl.Graph",
-        indices: TensorType,
-        graph_sampler: "cugraph_dgl.dataloading.Sampler",
-        device: Union[int, str, "torch.device"] = None,
-        use_ddp: bool = False,
-        ddp_seed: int = 0,
-        batch_size: int = 1,
-        drop_last: bool = False,
-        shuffle: bool = False,
-        use_prefetch_thread: Optional[bool] = None,
-        use_alternate_streams: Optional[bool] = None,
-        pin_prefetcher: Optional[bool] = None,
-        use_uva=False,
-        gpu_cache: Dict[str, Dict[str, int]] = None,
-        output_format: str = "dgl.Block",
-        **kwargs,
-    ):
-        """
-        Parameters
-        ----------
-        graph: cugraph_dgl.Graph
-            The graph being sampled.  Can be a single-GPU or multi-GPU graph.
-        indices: TensorType
-            The seed nodes for sampling.  If use_ddp=True, then all seed
-            nodes should be provided.  If use_ddp=False, then only the seed
-            nodes assigned to this worker should be provided.
-        graph_sampler: cugraph_dgl.dataloading.Sampler
-            The sampler responsible for sampling the graph and producing
-            output minibatches.
-        device: Union[int, str, torch.device]
-            Optional.
-            The device assigned to this loader ('cpu', 'cuda' or device id).
-            Defaults to the current device.
-        use_ddp: bool
-            Optional (default=False).
-            If true, this argument will assume the entire list of input seed
-            nodes is being passed to each worker, and will appropriately
-            split and shuffle the list.
-            It false, then it is assumed that the list of input seed nodes
-            is comprised of the union of the lists provided to each worker.
-        ddp_seed: int
-            Optional (default=0).
-            The seed used for dividing and shuffling data if use_ddp=True.
-            Has no effect if use_ddp=False.
-        use_uva: bool
-            Optional (default=False).
-            Whether to use pinned memory and unified virtual addressing
-            to perform sampling.
-            This argument is ignored by cuGraph-DGL.
-        use_prefetch_thread: bool
-            Optional (default=False).
-            Whether to spawn a new thread for feature fetching.
-            This argument is ignored by cuGraph-DGL.
-        use_alternate_streams: bool
-            Optional (default=False).
-            Whether to perform feature fetching on a separate stream.
-            This argument is ignored by cuGraph-DGL.
-        pin_prefetcher: bool
-            Optional (default=False).
-            Whether to pin the feature tensors.
-            This argument is currently ignored by cuGraph-DGL.
-        gpu_cache: Dict[str, Dict[str, int]]
-            List of features to cache using HugeCTR.
-            This argument is not supported by cuGraph-DGL and
-            will result in an error.
-        output_format: str
-            Optional (default="dgl.Block").
-            The output format for blocks.
-            Can be either "dgl.Block" or "cugraph_dgl.nn.SparseGraph".
-        """
-
-        if use_uva:
-            warnings.warn("The 'use_uva' argument is ignored by cuGraph-DGL.")
-        if use_prefetch_thread:
-            warnings.warn(
-                "The 'use_prefetch_thread' argument is ignored by cuGraph-DGL."
-            )
-        if use_alternate_streams:
-            warnings.warn(
-                "The 'use_alternate_streams' argument is ignored by cuGraph-DGL."
-            )
-        if pin_prefetcher:
-            warnings.warn("The 'pin_prefetcher' argument is ignored by cuGraph-DGL.")
-        if gpu_cache:
-            raise ValueError(
-                "HugeCTR is not supported by cuGraph-DGL. "
-                "Consider using WholeGraph for feature storage"
-                " in cugraph_dgl.Graph instead."
-            )
-
-        indices = _cast_to_torch_tensor(indices)
-
-        self.__dataset = dgl.dataloading.create_tensorized_dataset(
-            indices,
-            batch_size,
-            drop_last,
-            use_ddp,
-            ddp_seed,
-            shuffle,
-            kwargs.get("persistent_workers", False),
-        )
-
-        self.__output_format = output_format
-        self.__sampler = graph_sampler
-        self.__batch_size = batch_size
-        self.__graph = graph
-        self.__device = device
-
-    @property
-    def _batch_size(self):
-        return self.__batch_size
-
-    @property
-    def dataset(
-        self,
-    ) -> Union[
-        "dgl.dataloading.dataloader.TensorizedDataset",
-        "dgl.dataloading.dataloader.DDPTensorizedDataset",
-    ]:
-        return self.__dataset
-
-    def __iter__(self):
-        # TODO move to the correct device (rapidsai/cugraph-gnn#11)
-        return self.__sampler.sample(
-            self.__graph,
-            self.__dataset,
-            batch_size=self.__batch_size,
-        )
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py
deleted file mode 100644
index f6fe38fe9f8..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py
+++ /dev/null
@@ -1,218 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-from typing import Tuple, Dict, Optional, List, Union
-
-import os
-import cudf
-from cugraph.utilities.utils import import_optional
-from cugraph_dgl.dataloading.utils.sampling_helpers import (
-    create_homogeneous_sampled_graphs_from_dataframe,
-    create_heterogeneous_sampled_graphs_from_dataframe,
-    create_homogeneous_sampled_graphs_from_dataframe_csc,
-)
-
-
-dgl = import_optional("dgl")
-torch = import_optional("torch")
-
-
-# Todo: maybe should switch to __iter__
-class HomogenousBulkSamplerDataset(torch.utils.data.Dataset):
-    def __init__(
-        self,
-        total_number_of_nodes: int,
-        edge_dir: str,
-        return_type: str = "dgl.Block",
-        sparse_format: str = "coo",
-    ):
-        if return_type not in ["dgl.Block", "cugraph_dgl.nn.SparseGraph"]:
-            raise ValueError(
-                "return_type must be either 'dgl.Block' or "
-                "'cugraph_dgl.nn.SparseGraph'."
-            )
-        # TODO: Deprecate `total_number_of_nodes`
-        # as it is no longer needed
-        # in the next release
-        self.total_number_of_nodes = total_number_of_nodes
-        self.edge_dir = edge_dir
-        self.sparse_format = sparse_format
-        self._current_batch_fn = None
-        self._input_files = None
-        self._return_type = return_type
-
-    def __len__(self):
-        return self.num_batches
-
-    def __getitem__(self, idx: int):
-        if self._input_files is None:
-            raise dgl.DGLError(
-                "Please set input files by calling `set_input_files` "
-                "before trying to fetch a sample"
-            )
-
-        fn, batch_offset = self._batch_to_fn_d[idx]
-        if fn != self._current_batch_fn:
-            # Remove current batches to free up memory
-            # before loading new batches
-            if hasattr(self, "_current_batches"):
-                del self._current_batches
-            if self.sparse_format == "csc":
-                df = _load_sampled_file(dataset_obj=self, fn=fn, skip_rename=True)
-                self._current_batches = (
-                    create_homogeneous_sampled_graphs_from_dataframe_csc(df)
-                )
-            else:
-                df = _load_sampled_file(dataset_obj=self, fn=fn)
-                self._current_batches = (
-                    create_homogeneous_sampled_graphs_from_dataframe(
-                        sampled_df=df,
-                        edge_dir=self.edge_dir,
-                        return_type=self._return_type,
-                    )
-                )
-        current_offset = idx - batch_offset
-        return self._current_batches[current_offset]
-
-    def set_input_files(
-        self,
-        input_directory: Optional[str] = None,
-        input_file_paths: Optional[List[str]] = None,
-    ):
-        """
-        Set input files that have been created by the `cugraph.gnn.BulkSampler`
-        Parameters
-        ----------
-        input_directory: str
-           input_directory which contains all the files that will be
-           loaded by HomogenousBulkSamplerDataset
-        input_file_paths: List[str]
-            File paths that will be loaded by the HomogenousBulkSamplerDataset
-        """
-        _set_input_files(
-            self, input_directory=input_directory, input_file_paths=input_file_paths
-        )
-
-
-class HeterogenousBulkSamplerDataset(torch.utils.data.Dataset):
-    def __init__(
-        self,
-        num_nodes_dict: Dict[str, int],
-        etype_id_dict: Dict[int, Tuple[str, str, str]],
-        etype_offset_dict: Dict[Tuple[str, str, str], int],
-        ntype_offset_dict: Dict[str, int],
-        edge_dir: str = "in",
-    ):
-        self.num_nodes_dict = num_nodes_dict
-        self.etype_id_dict = etype_id_dict
-        self.etype_offset_dict = etype_offset_dict
-        self.ntype_offset_dict = ntype_offset_dict
-        self.edge_dir = edge_dir
-        self._current_batch_fn = None
-        self._input_files = None
-
-    def __len__(self):
-        return self.num_batches
-
-    def __getitem__(self, idx):
-        if self._input_files is None:
-            raise dgl.DGLError(
-                "Please set input files by calling `set_input_files` "
-                "before trying to fetch a sample"
-            )
-
-        fn, batch_offset = self._batch_to_fn_d[idx]
-        if fn != self._current_batch_fn:
-            df = _load_sampled_file(dataset_obj=self, fn=fn)
-            self._current_batches = create_heterogeneous_sampled_graphs_from_dataframe(
-                sampled_df=df,
-                num_nodes_dict=self.num_nodes_dict,
-                etype_id_dict=self.etype_id_dict,
-                etype_offset_dict=self.etype_offset_dict,
-                ntype_offset_dict=self.ntype_offset_dict,
-                edge_dir=self.edge_dir,
-            )
-            del df
-
-        current_offset = idx - batch_offset
-        return self._current_batches[current_offset]
-
-    def set_input_files(
-        self,
-        input_directory: Optional[str] = None,
-        input_file_paths: Optional[List[str]] = None,
-    ):
-        """
-        Set input files that have been created by the `cugraph.gnn.BulkSampler`
-        Parameters
-        ----------
-        input_directory: str
-            input_directory which contains all the files that will be
-            loaded by HeterogenousBulkSamplerDataset
-        input_file_paths: List[str]
-            File names that will be loaded by the HeterogenousBulkSamplerDataset
-        """
-        _set_input_files(
-            self, input_directory=input_directory, input_file_paths=input_file_paths
-        )
-
-
-def _load_sampled_file(dataset_obj, fn, skip_rename=False):
-    df = cudf.read_parquet(os.path.join(fn))
-    if dataset_obj.edge_dir == "in" and not skip_rename:
-        df.rename(
-            columns={"sources": "destinations", "destinations": "sources"},
-            inplace=True,
-        )
-    dataset_obj._current_batch_fn = fn
-    return df
-
-
-def get_batch_start_end(fn):
-    batch_str = fn.split("batch=")[1]
-    batch_start, batch_end = batch_str.split("-")
-    batch_end = batch_end.split(".parquet")[0]
-    return int(batch_start), int(batch_end)
-
-
-def get_batch_to_fn_d(files):
-    batch_to_fn_d = {}
-    batch_id = 0
-    for fn in files:
-        start, end = get_batch_start_end(fn)
-        batch_offset = batch_id
-        for _ in range(start, end + 1):
-            batch_to_fn_d[batch_id] = fn, batch_offset
-            batch_id += 1
-    return batch_to_fn_d
-
-
-def _set_input_files(
-    dataset_obj: Union[HomogenousBulkSamplerDataset, HeterogenousBulkSamplerDataset],
-    input_directory: Optional[str] = None,
-    input_file_paths: Optional[List[str]] = None,
-) -> None:
-
-    if input_directory is None and input_file_paths is None:
-        raise ValueError("input_files or input_file_paths must be set")
-
-    if (input_directory is not None) and (input_file_paths is not None):
-        raise ValueError("Only one of input_directory or input_file_paths must be set")
-
-    if input_file_paths:
-        dataset_obj._input_files = input_file_paths
-    if input_directory:
-        dataset_obj._input_files = [fp.path for fp in os.scandir(input_directory)]
-    dataset_obj._batch_to_fn_d = get_batch_to_fn_d(dataset_obj._input_files)
-    dataset_obj.num_batches = len(dataset_obj._batch_to_fn_d)
-    dataset_obj._current_batch_fn = None
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py
deleted file mode 100644
index ecc51006995..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import warnings
-import tempfile
-
-from typing import Sequence, Optional, Union, List, Tuple, Iterator
-
-from cugraph.gnn import UniformNeighborSampler, BiasedNeighborSampler, DistSampleWriter
-from cugraph.utilities.utils import import_optional
-
-import cugraph_dgl
-from cugraph_dgl.typing import DGLSamplerOutput
-from cugraph_dgl.dataloading.sampler import Sampler, HomogeneousSampleReader
-
-torch = import_optional("torch")
-
-
-class NeighborSampler(Sampler):
-    """Sampler that builds computational dependency of node representations via
-    neighbor sampling for multilayer GNN.
-    This sampler will make every node gather messages from a fixed number of neighbors
-    per edge type.  The neighbors are picked uniformly.
-    Parameters
-    ----------
-    fanouts_per_layer : int
-        List of neighbors to sample for each GNN layer, with the i-th
-        element being the fanout for the i-th GNN layer.
-        If -1 is provided then all inbound/outbound edges
-        of that edge type will be included.
-    edge_dir : str, default ``'in'``
-        Can be either ``'in' `` where the neighbors will be sampled according to
-        incoming edges, or ``'out'`` for outgoing edges
-    replace : bool, default False
-        Whether to sample with replacement
-    Examples
-    --------
-    **Node classification**
-    To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
-    a homogeneous graph where each node takes messages from 5, 10, 15 neighbors for
-    the first, second, and third layer respectively (assuming the backend is PyTorch):
-    >>> sampler = cugraph_dgl.dataloading.NeighborSampler([5, 10, 15])
-    >>> dataloader = cugraph_dgl.dataloading.DataLoader(
-    ...     g, train_nid, sampler,
-    ...     batch_size=1024, shuffle=True)
-    >>> for input_nodes, output_nodes, blocks in dataloader:
-    ...     train_on(blocks)
-    """
-
-    def __init__(
-        self,
-        fanouts_per_layer: Sequence[int],
-        edge_dir: str = "in",
-        replace: bool = False,
-        prob: Optional[str] = None,
-        mask: Optional[str] = None,
-        prefetch_node_feats: Optional[Union[List[str], dict[str, List[str]]]] = None,
-        prefetch_edge_feats: Optional[
-            Union[List[str], dict[Tuple[str, str, str], List[str]]]
-        ] = None,
-        prefetch_labels: Optional[Union[List[str], dict[str, List[str]]]] = None,
-        output_device: Optional[Union["torch.device", int, str]] = None,
-        fused: Optional[bool] = None,
-        sparse_format="csc",
-        output_format="dgl.Block",
-        **kwargs,
-    ):
-        """
-        Parameters
-        ----------
-        fanouts_per_layer: Sequence[int]
-            The number of neighbors to sample per layer.
-        edge_dir: str
-            Optional (default='in').
-            The direction to traverse edges.
-        replace: bool
-            Optional (default=False).
-            Whether to sample with replacement.
-        prob: str
-            Optional.
-            If provided, the probability of each neighbor being
-            sampled is proportional to the edge feature
-            with the given name.  Mutually exclusive with mask.
-        mask: str
-            Optional.
-            If proivided, only neighbors where the edge mask
-            with the given name is True can be selected.
-            Mutually exclusive with prob.
-            Currently unsupported.
-        prefetch_node_feats: Union[List[str], dict[str, List[str]]]
-            Optional.
-            Currently ignored by cuGraph-DGL.
-        prefetch_edge_feats: Union[List[str], dict[Tuple[str, str, str], List[str]]]
-            Optional.
-            Currently ignored by cuGraph-DGL.
-        prefetch_labels: Union[List[str], dict[str, List[str]]]
-            Optional.
-            Currently ignored by cuGraph-DGL.
-        output_device: Union[torch.device, int, str]
-            Optional.
-            Output device for samples. Defaults to the current device.
-        fused: bool
-            Optional.
-            This argument is ignored by cuGraph-DGL.
-        sparse_format: str
-            Optional (default = "coo").
-            The sparse format of the emitted sampled graphs.
-            Currently, only "csc" is supported.
-        output_format: str
-            Optional (default = "dgl.Block")
-            The output format of the emitted sampled graphs.
-            Can be either "dgl.Block" (default), or "cugraph_dgl.nn.SparseGraph".
-        **kwargs
-            Keyword arguments for the underlying cuGraph distributed sampler
-            and writer (directory, batches_per_partition, format,
-            local_seeds_per_call).
-        """
-
-        if mask:
-            raise NotImplementedError(
-                "Edge masking is currently unsupported by cuGraph-DGL"
-            )
-        if prefetch_edge_feats:
-            warnings.warn("'prefetch_edge_feats' is ignored by cuGraph-DGL")
-        if prefetch_node_feats:
-            warnings.warn("'prefetch_node_feats' is ignored by cuGraph-DGL")
-        if prefetch_labels:
-            warnings.warn("'prefetch_labels' is ignored by cuGraph-DGL")
-        if fused:
-            warnings.warn("'fused' is ignored by cuGraph-DGL")
-
-        self.__prob_attr = prob
-
-        self.fanouts = fanouts_per_layer
-        reverse_fanouts = fanouts_per_layer.copy()
-        reverse_fanouts.reverse()
-        self._reversed_fanout_vals = reverse_fanouts
-
-        self.edge_dir = edge_dir
-        self.replace = replace
-        self.__kwargs = kwargs
-
-        super().__init__(
-            sparse_format=sparse_format,
-            output_format=output_format,
-        )
-
-    def sample(
-        self,
-        g: "cugraph_dgl.Graph",
-        indices: Iterator["torch.Tensor"],
-        batch_size: int = 1,
-    ) -> Iterator[DGLSamplerOutput]:
-        kwargs = dict(**self.__kwargs)
-
-        directory = kwargs.pop("directory", None)
-        if directory is None:
-            warnings.warn("Setting a directory to store samples is recommended.")
-            self._tempdir = tempfile.TemporaryDirectory()
-            directory = self._tempdir.name
-
-        writer = DistSampleWriter(
-            directory=directory,
-            batches_per_partition=kwargs.pop("batches_per_partition", 256),
-            format=kwargs.pop("format", "parquet"),
-        )
-
-        sampling_clx = (
-            UniformNeighborSampler
-            if self.__prob_attr is None
-            else BiasedNeighborSampler
-        )
-
-        ds = sampling_clx(
-            g._graph(self.edge_dir, prob_attr=self.__prob_attr),
-            writer,
-            compression="CSR",
-            fanout=self._reversed_fanout_vals,
-            prior_sources_behavior="carryover",
-            deduplicate_sources=True,
-            compress_per_hop=True,
-            with_replacement=self.replace,
-            **kwargs,
-        )
-
-        if g.is_homogeneous:
-            indices = torch.concat(list(indices))
-            reader = ds.sample_from_nodes(indices.long(), batch_size=batch_size)
-            return HomogeneousSampleReader(reader, self.output_format, self.edge_dir)
-
-        raise ValueError(
-            "Sampling heterogeneous graphs is currently"
-            " unsupported in the non-dask API"
-        )
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py
deleted file mode 100644
index 7ea608e7e53..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Iterator, Dict, Tuple, List, Union
-
-import cugraph_dgl
-from cugraph_dgl.nn import SparseGraph
-from cugraph_dgl.typing import DGLSamplerOutput
-from cugraph_dgl.dataloading.utils.sampling_helpers import (
-    create_homogeneous_sampled_graphs_from_tensors_csc,
-)
-
-
-from cugraph.utilities.utils import import_optional
-
-torch = import_optional("torch")
-dgl = import_optional("dgl")
-
-
-class SampleReader:
-    """
-    Iterator that processes results from the cuGraph distributed sampler.
-    """
-
-    def __init__(
-        self,
-        base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]],
-        output_format: str = "dgl.Block",
-    ):
-        """
-        Constructs a new SampleReader.
-
-        Parameters
-        ----------
-        base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]
-            The iterator responsible for loading saved samples produced by
-            the cuGraph distributed sampler.
-        """
-        self.__output_format = output_format
-        self.__base_reader = base_reader
-        self.__num_samples_remaining = 0
-        self.__index = 0
-
-    @property
-    def output_format(self) -> str:
-        return self.__output_format
-
-    def __next__(self) -> DGLSamplerOutput:
-        if self.__num_samples_remaining == 0:
-            # raw_sample_data is already a dict of tensors
-            self.__raw_sample_data, start_inclusive, end_inclusive = next(
-                self.__base_reader
-            )
-
-            self.__decoded_samples = self._decode_all(self.__raw_sample_data)
-            self.__num_samples_remaining = end_inclusive - start_inclusive + 1
-            self.__index = 0
-
-        out = self.__decoded_samples[self.__index]
-        self.__index += 1
-        self.__num_samples_remaining -= 1
-        return out
-
-    def _decode_all(self) -> List[DGLSamplerOutput]:
-        raise NotImplementedError("Must be implemented by subclass")
-
-    def __iter__(self) -> DGLSamplerOutput:
-        return self
-
-
-class HomogeneousSampleReader(SampleReader):
-    """
-    Subclass of SampleReader that reads DGL homogeneous output samples
-    produced by the cuGraph distributed sampler.
-    """
-
-    def __init__(
-        self,
-        base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]],
-        output_format: str = "dgl.Block",
-        edge_dir="in",
-    ):
-        """
-        Constructs a new HomogeneousSampleReader
-
-        Parameters
-        ----------
-        base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]
-            The reader responsible for loading saved samples produced by
-            the cuGraph distributed sampler.
-        output_format: str
-            The output format for blocks (either "dgl.Block" or
-            "cugraph_dgl.nn.SparseGraph").
-        edge_dir: str
-            The direction sampling was performed in ("in" or "out").
-        """
-
-        self.__edge_dir = edge_dir
-        super().__init__(base_reader, output_format=output_format)
-
-    def __decode_csc(
-        self, raw_sample_data: Dict[str, "torch.Tensor"]
-    ) -> List[DGLSamplerOutput]:
-        return create_homogeneous_sampled_graphs_from_tensors_csc(
-            raw_sample_data, output_format=self.output_format
-        )
-
-    def __decode_coo(
-        self, raw_sample_data: Dict[str, "torch.Tensor"]
-    ) -> List[DGLSamplerOutput]:
-        raise NotImplementedError(
-            "COO format is currently unsupported in the non-dask API"
-        )
-
-    def _decode_all(
-        self, raw_sample_data: Dict[str, "torch.Tensor"]
-    ) -> List[DGLSamplerOutput]:
-        if "major_offsets" in raw_sample_data:
-            return self.__decode_csc(raw_sample_data)
-        else:
-            return self.__decode_coo(raw_sample_data)
-
-
-class Sampler:
-    """
-    Base sampler class for all cugraph-DGL samplers.
-    """
-
-    def __init__(self, sparse_format: str = "csc", output_format="dgl.Block"):
-        """
-        Parameters
-        ----------
-        sparse_format: str
-            Optional (default = "coo").
-            The sparse format of the emitted sampled graphs.
-            Currently, only "csc" is supported.
-        output_format: str
-            Optional (default = "dgl.Block")
-            The output format of the emitted sampled graphs.
-            Can be either "dgl.Block" (default), or "cugraph_dgl.nn.SparseGraph".
-        """
-
-        if sparse_format != "csc":
-            raise ValueError("Only CSC format is supported at this time")
-
-        self.__output_format = output_format
-
-    @property
-    def output_format(self):
-        return self.__output_format
-
-    @property
-    def sparse_format(self):
-        return self.__sparse_format
-
-    def sample(
-        self,
-        g: cugraph_dgl.Graph,
-        indices: Iterator["torch.Tensor"],
-        batch_size: int = 1,
-    ) -> Iterator[
-        Tuple["torch.Tensor", "torch.Tensor", List[Union[SparseGraph, "dgl.Block"]]]
-    ]:
-        """
-        Samples the graph.
-
-        Parameters
-        ----------
-        g: cugraph_dgl.Graph
-            The graph being sampled.
-        indices: TensorType
-            The node ids of seed nodes where sampling will initiate from.
-        batch_size: int
-            The number of seed nodes per batch.
-
-        Returns
-        -------
-        Iterator[DGLSamplerOutput]
-            Iterator over batches.  The returned tuples are in standard
-            DGL format: (input nodes, output nodes, blocks) where input
-            nodes are the renumbered input nodes, output nodes are
-            the renumbered output nodes, and blocks are the output graphs
-            for each hop.
-        """
-
-        raise NotImplementedError("Must be implemented by subclass")
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/__init__.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/__init__.py
deleted file mode 100644
index a1dd01f33d4..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/extract_graph_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/extract_graph_helpers.py
deleted file mode 100644
index 0d3d5823097..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/extract_graph_helpers.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-from typing import Tuple, Dict, Union
-import cugraph
-import cudf
-import dask_cudf
-import numpy as np
-
-
-def create_cugraph_graph_from_edges_dict(
-    edges_dict: Dict[Tuple(str, str, str), Union[dask_cudf.DataFrame, cudf.DataFrame]],
-    etype_id_dict: Dict[Dict[Tuple(str, str, str)] : int],
-    edge_dir: str,
-):
-    if edge_dir == "in":
-        edges_dict = {k: reverse_edges(df) for k, df in edges_dict.items()}
-    if len(edges_dict) > 1:
-        has_multiple_etypes = True
-        edges_dict = {
-            k: add_etype_id(df, etype_id_dict[k]) for k, df in edges_dict.items()
-        }
-    else:
-        has_multiple_etypes = False
-
-    edges_dfs = list(edges_dict.values())
-    del edges_dict
-    if isinstance(edges_dfs[0], dask_cudf.DataFrame):
-        edges_df = dask_cudf.concat(edges_dfs, ignore_index=True)
-    else:
-        edges_df = cudf.concat(edges_dfs, ignore_index=True)
-    del edges_dfs
-
-    G = cugraph.MultiGraph(directed=True)
-    if isinstance(edges_df, dask_cudf.DataFrame):
-        g_creation_f = G.from_dask_cudf_edgelist
-    else:
-        g_creation_f = G.from_cudf_edgelist
-
-    if has_multiple_etypes:
-        edge_etp = "etp"
-    else:
-        edge_etp = None
-
-    g_creation_f(
-        edges_df,
-        source="_SRC_",
-        destination="_DST_",
-        weight=None,
-        edge_id="_EDGE_ID_",
-        edge_type=edge_etp,
-        renumber=True,
-    )
-    return G
-
-
-def reverse_edges(df: Union[dask_cudf.DataFrame, cudf.DataFrame]):
-    return df.rename(columns={"_SRC_": "_DST_", "_DST_": "_SRC_"})
-
-
-def add_etype_id(df: Union[dask_cudf.DataFrame, cudf.DataFrame], etype_id: int):
-    df["etp"] = np.int32(etype_id)
-    return df
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py
deleted file mode 100644
index 3b7e4502134..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py
+++ /dev/null
@@ -1,692 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-from typing import List, Tuple, Dict, Optional
-from collections import defaultdict
-import cudf
-from cugraph.utilities.utils import import_optional
-from cugraph_dgl.nn import SparseGraph
-
-dgl = import_optional("dgl")
-torch = import_optional("torch")
-cugraph_dgl = import_optional("cugraph_dgl")
-
-
-def cast_to_tensor(ser: cudf.Series):
-    if len(ser) == 0:
-        # Empty series can not be converted to pytorch cuda tensor
-        t = torch.from_numpy(ser.values.get())
-        return t.to("cuda")
-
-    return torch.as_tensor(ser.values, device="cuda")
-
-
-def _split_tensor(t, split_indices):
-    """
-    Split a tensor into a list of tensors based on split_indices.
-    """
-    # TODO: Switch to something below
-    # return [t[i:j] for i, j in zip(split_indices[:-1], split_indices[1:])]
-    if split_indices.device.type != "cpu":
-        split_indices = split_indices.to("cpu")
-    return torch.tensor_split(t, split_indices)
-
-
-def _get_source_destination_range(sampled_df):
-    o = sampled_df.groupby(["batch_id", "hop_id"], as_index=True).agg(
-        {"sources": "max", "destinations": "max"}
-    )
-    o.rename(
-        columns={"sources": "sources_range", "destinations": "destinations_range"},
-        inplace=True,
-    )
-    d = o.to_dict(orient="index")
-    return d
-
-
-def _create_split_dict(tensor):
-    min_value = tensor.min()
-    max_value = tensor.max()
-    indices = torch.arange(
-        start=min_value + 1,
-        end=max_value + 1,
-        device=tensor.device,
-    )
-    split_dict = {i: {} for i in range(min_value, max_value + 1)}
-    return split_dict, indices
-
-
-def _get_renumber_map(df):
-    map = df["map"]
-    df.drop(columns=["map"], inplace=True)
-
-    map_starting_offset = map.iloc[0]
-    renumber_map = map[map_starting_offset:].dropna().reset_index(drop=True)
-    renumber_map_batch_indices = map[1 : map_starting_offset - 1].reset_index(drop=True)
-    renumber_map_batch_indices = renumber_map_batch_indices - map_starting_offset
-
-    map_end_offset = map_starting_offset + len(renumber_map)
-    # We only need to drop rows if the length of dataframe is determined by the map
-    # that is if map_length > sampled edges length
-    if map_end_offset == len(df):
-        df.dropna(axis=0, how="all", inplace=True)
-        df.reset_index(drop=True, inplace=True)
-
-    return df, cast_to_tensor(renumber_map), cast_to_tensor(renumber_map_batch_indices)
-
-
-def _get_tensor_d_from_sampled_df(df):
-    """
-    Converts a sampled cuDF DataFrame into a list of tensors.
-
-    Args:
-        df (cudf.DataFrame): The sampled cuDF DataFrame containing columns
-    Returns:
-        dict: A dictionary of tensors, keyed by batch_id and hop_id.
-    """
-    range_d = _get_source_destination_range(df)
-    df, renumber_map, renumber_map_batch_indices = _get_renumber_map(df)
-    batch_id_tensor = cast_to_tensor(df["batch_id"])
-    split_d, batch_indices = _create_split_dict(batch_id_tensor)
-    batch_split_indices = torch.searchsorted(batch_id_tensor, batch_indices).to("cpu")
-
-    for column in df.columns:
-        if column != "batch_id":
-            t = cast_to_tensor(df[column])
-            split_t = _split_tensor(t, batch_split_indices)
-            for bid, batch_t in zip(split_d.keys(), split_t):
-                split_d[bid][column] = batch_t
-
-    split_t = _split_tensor(renumber_map, renumber_map_batch_indices)
-    for bid, batch_t in zip(split_d.keys(), split_t):
-        split_d[bid]["map"] = batch_t
-    del df
-    result_tensor_d = {}
-    # Cache hop_split_d, hop_indices
-    hop_split_empty_d, hop_indices = None, None
-    for batch_id, batch_d in split_d.items():
-        hop_id_tensor = batch_d["hop_id"]
-        if hop_split_empty_d is None:
-            hop_split_empty_d, hop_indices = _create_split_dict(hop_id_tensor)
-
-        hop_split_d = {k: {} for k in hop_split_empty_d.keys()}
-        hop_split_indices = torch.searchsorted(hop_id_tensor, hop_indices).to("cpu")
-        for column, t in batch_d.items():
-            if column not in ["hop_id", "map"]:
-                split_t = _split_tensor(t, hop_split_indices)
-                for hid, ht in zip(hop_split_d.keys(), split_t):
-                    hop_split_d[hid][column] = ht
-        for hid in hop_split_d.keys():
-            hop_split_d[hid]["sources_range"] = range_d[(batch_id, hid)][
-                "sources_range"
-            ]
-            hop_split_d[hid]["destinations_range"] = range_d[(batch_id, hid)][
-                "destinations_range"
-            ]
-
-        result_tensor_d[batch_id] = hop_split_d
-        result_tensor_d[batch_id]["map"] = batch_d["map"]
-    return result_tensor_d
-
-
-def create_homogeneous_sampled_graphs_from_dataframe(
-    sampled_df: cudf.DataFrame,
-    edge_dir: str = "in",
-    return_type: str = "dgl.Block",
-):
-    """
-    This helper function creates DGL MFGS  for
-    homogeneous graphs from cugraph sampled dataframe
-
-    Args:
-        sampled_df (cudf.DataFrame): The sampled cuDF DataFrame containing
-            columns `sources`, `destinations`, `edge_id`, `batch_id` and
-            `hop_id`.
-        edge_dir (str): Direction of edges from samples
-    Returns:
-        list: A list containing three elements:
-            - input_nodes: The input nodes for the batch.
-            - output_nodes: The output nodes for the batch.
-            - graph_per_hop_ls: A list of DGL MFGS for each hop.
-    """
-    if return_type not in ["dgl.Block", "cugraph_dgl.nn.SparseGraph"]:
-        raise ValueError(
-            "return_type must be either dgl.Block or cugraph_dgl.nn.SparseGraph"
-        )
-
-    result_tensor_d = _get_tensor_d_from_sampled_df(sampled_df)
-    del sampled_df
-    result_mfgs = [
-        _create_homogeneous_sampled_graphs_from_tensors_perhop(
-            tensors_batch_d, edge_dir, return_type
-        )
-        for tensors_batch_d in result_tensor_d.values()
-    ]
-    del result_tensor_d
-    return result_mfgs
-
-
-def _create_homogeneous_sampled_graphs_from_tensors_perhop(
-    tensors_batch_d, edge_dir, return_type
-):
-    """
-    This helper function creates sampled DGL MFGS for
-    homogeneous graphs from tensors per hop for a single
-    batch
-    Args:
-        tensors_batch_d (dict): A dictionary of tensors, keyed by hop_id.
-        edge_dir (str): Direction of edges from samples
-        metagraph (dgl.metagraph): The metagraph for the sampled graph
-        return_type (str): The type of graph to return
-    Returns:
-        tuple: A tuple of three elements:
-            - input_nodes: The input nodes for the batch.
-            - output_nodes: The output nodes for the batch.
-            - graph_per_hop_ls: A list of MFGS for each hop.
-    """
-    if edge_dir not in ["in", "out"]:
-        raise ValueError(f"Invalid edge_dir {edge_dir} provided")
-    if edge_dir == "out":
-        raise ValueError("Outwards edges not supported yet")
-    graph_per_hop_ls = []
-    seednodes_range = None
-    for hop_id, tensor_per_hop_d in tensors_batch_d.items():
-        if hop_id != "map":
-            if return_type == "dgl.Block":
-                mfg = _create_homogeneous_dgl_block_from_tensor_d(
-                    tensor_d=tensor_per_hop_d,
-                    renumber_map=tensors_batch_d["map"],
-                    seednodes_range=seednodes_range,
-                )
-            elif return_type == "cugraph_dgl.nn.SparseGraph":
-                mfg = _create_homogeneous_cugraph_dgl_nn_sparse_graph(
-                    tensor_d=tensor_per_hop_d, seednodes_range=seednodes_range
-                )
-            else:
-                raise ValueError(f"Invalid return_type {return_type} provided")
-            seednodes_range = max(
-                tensor_per_hop_d["sources_range"],
-                tensor_per_hop_d["destinations_range"],
-            )
-            graph_per_hop_ls.append(mfg)
-
-    # default DGL behavior
-    if edge_dir == "in":
-        graph_per_hop_ls.reverse()
-    if return_type == "dgl.Block":
-        input_nodes = graph_per_hop_ls[0].srcdata[dgl.NID]
-        output_nodes = graph_per_hop_ls[-1].dstdata[dgl.NID]
-    else:
-        map = tensors_batch_d["map"]
-        input_nodes = map[0 : graph_per_hop_ls[0].num_src_nodes()]
-        output_nodes = map[0 : graph_per_hop_ls[-1].num_dst_nodes()]
-    return input_nodes, output_nodes, graph_per_hop_ls
-
-
-def _create_homogeneous_dgl_block_from_tensor_d(
-    tensor_d,
-    renumber_map,
-    seednodes_range=None,
-):
-    rs = tensor_d["sources"]
-    rd = tensor_d["destinations"]
-    max_src_nodes = tensor_d["sources_range"]
-    max_dst_nodes = tensor_d["destinations_range"]
-    if seednodes_range is not None:
-        # If we have  vertices without outgoing edges, then
-        # sources can be missing from seednodes
-        # so we add them
-        # to ensure all the blocks are
-        # lined up correctly
-        max_dst_nodes = max(max_dst_nodes, seednodes_range)
-
-    data_dict = {("_N", "_E", "_N"): (rs, rd)}
-    num_src_nodes = {"_N": max_src_nodes + 1}
-    num_dst_nodes = {"_N": max_dst_nodes + 1}
-
-    block = dgl.create_block(
-        data_dict=data_dict, num_src_nodes=num_src_nodes, num_dst_nodes=num_dst_nodes
-    )
-    if "edge_id" in tensor_d:
-        block.edata[dgl.EID] = tensor_d["edge_id"]
-    # Below adds run time overhead
-    block.srcdata[dgl.NID] = renumber_map[0 : max_src_nodes + 1]
-    block.dstdata[dgl.NID] = renumber_map[0 : max_dst_nodes + 1]
-    return block
-
-
-def _create_homogeneous_cugraph_dgl_nn_sparse_graph(tensor_d, seednodes_range):
-    max_src_nodes = tensor_d["sources_range"]
-    max_dst_nodes = tensor_d["destinations_range"]
-    if seednodes_range is not None:
-        max_dst_nodes = max(max_dst_nodes, seednodes_range)
-    size = (max_src_nodes + 1, max_dst_nodes + 1)
-    sparse_graph = cugraph_dgl.nn.SparseGraph(
-        size=size,
-        src_ids=tensor_d["sources"],
-        dst_ids=tensor_d["destinations"],
-        formats=["csc"],
-        reduce_memory=True,
-    )
-    return sparse_graph
-
-
-def create_heterogeneous_sampled_graphs_from_dataframe(
-    sampled_df: cudf.DataFrame,
-    num_nodes_dict: Dict[str, int],
-    etype_id_dict: Dict[int, Tuple[str, str, str]],
-    etype_offset_dict: Dict[Tuple[str, str, str], int],
-    ntype_offset_dict: Dict[str, int],
-    edge_dir: str = "in",
-):
-    """
-    This helper function creates DGL MFGS from cugraph sampled dataframe
-    """
-    sampled_df["batch_id"] = sampled_df["batch_id"] - sampled_df["batch_id"].min()
-    result_df_ls = sampled_df[
-        ["sources", "destinations", "edge_id", "hop_id", "edge_type"]
-    ].scatter_by_map(sampled_df["batch_id"], keep_index=False)
-    del sampled_df
-
-    result_df_ls = [
-        batch_df[["sources", "destinations", "edge_id", "edge_type"]].scatter_by_map(
-            batch_df["hop_id"], keep_index=False
-        )
-        for batch_df in result_df_ls
-    ]
-
-    result_tensor_ls = [
-        [
-            _get_edges_dict_from_perhop_df(
-                h_df, etype_id_dict, etype_offset_dict, ntype_offset_dict
-            )
-            for h_df in per_batch_ls
-        ]
-        for per_batch_ls in result_df_ls
-    ]
-    del result_df_ls
-
-    result_mfgs = [
-        _create_heterogenous_sampled_graphs_from_tensors_perhop(
-            tensors_perhop_ls, num_nodes_dict, edge_dir
-        )
-        for tensors_perhop_ls in result_tensor_ls
-    ]
-    return result_mfgs
-
-
-def _get_edges_dict_from_perhop_df(
-    df, etype_id_dict, etype_offset_dict, ntype_offset_dict
-):
-    # Optimize below function
-    # based on _get_tensor_ls_from_sampled_df
-    edges_per_type_ls = df[["sources", "destinations", "edge_id"]].scatter_by_map(
-        df["edge_type"], map_size=len(etype_id_dict), keep_index=False
-    )
-    del df
-    per_type_df_d = {etype_id_dict[i]: df for i, df in enumerate(edges_per_type_ls)}
-    del edges_per_type_ls
-    # reverse src,dst here
-    per_type_tensor_d = {
-        etype: (
-            cast_to_tensor(etype_df["sources"]) - ntype_offset_dict[etype[0]],
-            cast_to_tensor(etype_df["destinations"]) - ntype_offset_dict[etype[2]],
-            cast_to_tensor(etype_df["edge_id"]) - etype_offset_dict[etype],
-        )
-        for etype, etype_df in per_type_df_d.items()
-    }
-    return per_type_tensor_d
-
-
-def _create_heterogenous_sampled_graphs_from_tensors_perhop(
-    tensors_perhop_ls, num_nodes_dict, edge_dir
-):
-    if edge_dir not in ["in", "out"]:
-        raise ValueError(f"Invalid edge_dir {edge_dir} provided")
-    if edge_dir == "out":
-        raise ValueError("Outwards edges not supported yet")
-    graph_per_hop_ls = []
-    output_nodes = None
-
-    seed_nodes = None
-    for hop_edges_dict in tensors_perhop_ls:
-        block = create_heterogenous_dgl_block_from_tensors_dict(
-            hop_edges_dict, num_nodes_dict, seed_nodes
-        )
-        seed_nodes = block.srcdata[dgl.NID]
-        if output_nodes is None:
-            output_nodes = block.dstdata[dgl.NID]
-        graph_per_hop_ls.append(block)
-
-    # default DGL behavior
-    if edge_dir == "in":
-        graph_per_hop_ls.reverse()
-    return seed_nodes, output_nodes, graph_per_hop_ls
-
-
-def create_heterogenous_dgl_block_from_tensors_dict(
-    edges_dict: Dict[Tuple(str, str, str), (torch.Tensor, torch.Tensor, torch.Tensor)],
-    num_nodes_dict: Dict[str, torch.Tensor],
-    seed_nodes: Optional[Dict[str, torch.Tensor]],
-):
-    data_dict = {k: (s, d) for k, (s, d, _) in edges_dict.items()}
-    edge_ids_dict = {k: eid for k, (_, _, eid) in edges_dict.items()}
-
-    sampled_graph = dgl.heterograph(
-        data_dict=data_dict,
-        num_nodes_dict=num_nodes_dict,
-    )
-    sampled_graph.edata[dgl.EID] = edge_ids_dict
-
-    src_d = defaultdict(list)
-    dst_d = defaultdict(list)
-
-    for (s, _, d), (src_id, dst_id) in data_dict.items():
-        src_d[s].append(src_id)
-        dst_d[d].append(dst_id)
-
-    src_d = {k: torch.cat(v).unique() for k, v in src_d.items() if len(v) > 0}
-    if seed_nodes is None:
-        seed_nodes = {k: torch.cat(v).unique() for k, v in dst_d.items() if len(v) > 0}
-
-    block = dgl.to_block(sampled_graph, dst_nodes=seed_nodes, src_nodes=src_d)
-    block.edata[dgl.EID] = sampled_graph.edata[dgl.EID]
-    return block
-
-
-def _process_sampled_tensors_csc(
-    tensors: Dict["torch.Tensor"],
-    reverse_hop_id: bool = True,
-) -> Tuple[
-    Dict[int, Dict[int, Dict[str, "torch.Tensor"]]],
-    List["torch.Tensor"],
-    List[List[int, int]],
-]:
-    """
-    Convert tensors generated by BulkSampler to a dictionary of tensors, to
-    facilitate MFG creation. The sampled graphs in the dataframe use CSC-format.
-
-    Parameters
-    ----------
-    tensors: Dict[torch.Tensor]
-        The output from BulkSampler compressed in CSC format. The dataframe
-        should be generated with `compression="CSR"` in BulkSampler,
-        since the sampling routine treats seed nodes as sources.
-
-    reverse_hop_id: bool (default=True)
-        Reverse hop id.
-
-    Returns
-    -------
-    tensors_dict: dict
-        A nested dictionary keyed by batch id and hop id.
-        `tensor_dict[batch_id][hop_id]` holds "minors" and "major_offsets"
-        values for CSC MFGs.
-
-    renumber_map_list: list
-        List of renumbering maps for looking up global indices of nodes. One
-        map for each batch.
-
-    mfg_sizes: list
-        List of the number of nodes in each message passing layer. For the
-        k-th hop, mfg_sizes[k] and mfg_sizes[k+1] is the number of sources and
-        destinations, respectively.
-    """
-
-    major_offsets = tensors["major_offsets"]
-    minors = tensors["minors"]
-    label_hop_offsets = tensors["label_hop_offsets"]
-    renumber_map = tensors["map"]
-    renumber_map_offsets = tensors["renumber_map_offsets"]
-
-    n_batches = len(renumber_map_offsets) - 1
-    n_hops = int((len(label_hop_offsets) - 1) / n_batches)
-
-    # make global offsets local
-    # Have to make a clone as pytorch does not allow
-    # in-place operations on tensors
-    major_offsets -= major_offsets[0].clone()
-    label_hop_offsets -= label_hop_offsets[0].clone()
-    renumber_map_offsets -= renumber_map_offsets[0].clone()
-
-    # get the sizes of each adjacency matrix (for MFGs)
-    mfg_sizes = (label_hop_offsets[1:] - label_hop_offsets[:-1]).reshape(
-        (n_batches, n_hops)
-    )
-    n_nodes = renumber_map_offsets[1:] - renumber_map_offsets[:-1]
-    mfg_sizes = torch.hstack((mfg_sizes, n_nodes.reshape(n_batches, -1)))
-    if reverse_hop_id:
-        mfg_sizes = mfg_sizes.flip(1)
-
-    tensors_dict = {}
-    renumber_map_list = []
-    # Note: minors and major_offsets from BulkSampler are of type int32
-    # and int64 respectively. Since pylibcugraphops binding code doesn't
-    # support distinct node and edge index type, we simply casting both
-    # to int32 for now.
-    minors = minors.int()
-    major_offsets = major_offsets.int()
-    # Note: We transfer tensors to CPU here to avoid the overhead of
-    # transferring them in each iteration of the for loop below.
-    major_offsets_cpu = major_offsets.to("cpu").numpy()
-    label_hop_offsets_cpu = label_hop_offsets.to("cpu").numpy()
-
-    for batch_id in range(n_batches):
-        batch_dict = {}
-        for hop_id in range(n_hops):
-            hop_dict = {}
-            idx = batch_id * n_hops + hop_id  # idx in label_hop_offsets
-            major_offsets_start = label_hop_offsets_cpu[idx]
-            major_offsets_end = label_hop_offsets_cpu[idx + 1]
-            minors_start = major_offsets_cpu[major_offsets_start]
-            minors_end = major_offsets_cpu[major_offsets_end]
-            hop_dict["minors"] = minors[minors_start:minors_end]
-            hop_dict["major_offsets"] = (
-                major_offsets[major_offsets_start : major_offsets_end + 1]
-                - major_offsets[major_offsets_start]
-            )
-            if reverse_hop_id:
-                batch_dict[n_hops - 1 - hop_id] = hop_dict
-            else:
-                batch_dict[hop_id] = hop_dict
-
-        tensors_dict[batch_id] = batch_dict
-
-        renumber_map_list.append(
-            renumber_map[
-                renumber_map_offsets[batch_id] : renumber_map_offsets[batch_id + 1]
-            ],
-        )
-
-    return tensors_dict, renumber_map_list, mfg_sizes.tolist()
-
-
-def _process_sampled_df_csc(
-    df: cudf.DataFrame,
-    reverse_hop_id: bool = True,
-):
-    """
-    Convert a dataframe generated by BulkSampler to a dictionary of tensors, to
-    facilitate MFG creation. The sampled graphs in the dataframe use CSC-format.
-
-    Parameters
-    ----------
-    df: cudf.DataFrame
-        The output from BulkSampler compressed in CSC format. The dataframe
-        should be generated with `compression="CSR"` in BulkSampler,
-        since the sampling routine treats seed nodes as sources.
-
-    reverse_hop_id: bool (default=True)
-        Reverse hop id.
-
-    Returns
-    -------
-    tensors_dict: dict
-        A nested dictionary keyed by batch id and hop id.
-        `tensor_dict[batch_id][hop_id]` holds "minors" and "major_offsets"
-        values for CSC MFGs.
-
-    renumber_map_list: list
-        List of renumbering maps for looking up global indices of nodes. One
-        map for each batch.
-
-    mfg_sizes: list
-        List of the number of nodes in each message passing layer. For the
-        k-th hop, mfg_sizes[k] and mfg_sizes[k+1] is the number of sources and
-        destinations, respectively.
-    """
-
-    return _process_sampled_tensors_csc(
-        {
-            "major_offsets": cast_to_tensor(df.major_offsets.dropna()),
-            "label_hop_offsets": cast_to_tensor(df.label_hop_offsets.dropna()),
-            "renumber_map_offsets": cast_to_tensor(df.renumber_map_offsets.dropna()),
-            "map": cast_to_tensor(df["map"].dropna()),
-            "minors": cast_to_tensor(df.minors.dropna()),
-        },
-        reverse_hop_id=reverse_hop_id,
-    )
-
-
-def _create_homogeneous_blocks_from_csc(
-    tensors_dict: Dict[int, Dict[int, Dict[str, torch.Tensor]]],
-    renumber_map_list: List[torch.Tensor],
-    mfg_sizes: List[int, int],
-):
-    """Create mini-batches of MFGs in the dgl.Block format.
-    The input arguments are the outputs of
-    the function `_process_sampled_df_csc`.
-
-    Returns
-    -------
-    output: list
-        A list of mini-batches. Each mini-batch is a list that consists of
-        `input_nodes` tensor, `output_nodes` tensor and a list of MFGs.
-    """
-    n_batches, n_hops = len(mfg_sizes), len(mfg_sizes[0]) - 1
-    output = []
-    for b_id in range(n_batches):
-        output_batch = []
-        output_batch.append(renumber_map_list[b_id])
-        output_batch.append(renumber_map_list[b_id][: mfg_sizes[b_id][-1]])
-
-        mfgs = [
-            SparseGraph(
-                size=(mfg_sizes[b_id][h_id], mfg_sizes[b_id][h_id + 1]),
-                src_ids=tensors_dict[b_id][h_id]["minors"],
-                cdst_ids=tensors_dict[b_id][h_id]["major_offsets"],
-                formats=["csc", "coo"],
-                reduce_memory=True,
-            )
-            for h_id in range(n_hops)
-        ]
-
-        blocks = []
-        seednodes_range = None
-        for mfg in reversed(mfgs):
-            block_mfg = _create_homogeneous_dgl_block_from_tensor_d(
-                {
-                    "sources": mfg.src_ids(),
-                    "destinations": mfg.dst_ids(),
-                    "sources_range": mfg._num_src_nodes - 1,
-                    "destinations_range": mfg._num_dst_nodes - 1,
-                },
-                renumber_map=renumber_map_list[b_id],
-                seednodes_range=seednodes_range,
-            )
-
-            seednodes_range = max(
-                mfg._num_src_nodes - 1,
-                mfg._num_dst_nodes - 1,
-            )
-            blocks.append(block_mfg)
-        del mfgs
-
-        blocks.reverse()
-
-        output_batch.append(blocks)
-
-        output.append(output_batch)
-    return output
-
-
-def _create_homogeneous_sparse_graphs_from_csc(
-    tensors_dict: Dict[int, Dict[int, Dict[str, torch.Tensor]]],
-    renumber_map_list: List[torch.Tensor],
-    mfg_sizes: List[int, int],
-) -> List[List[torch.Tensor, torch.Tensor, List[SparseGraph]]]:
-    """Create mini-batches of MFGs. The input arguments are the outputs of
-    the function `_process_sampled_df_csc`.
-
-    Returns
-    -------
-    output: list
-        A list of mini-batches. Each mini-batch is a list that consists of
-        `input_nodes` tensor, `output_nodes` tensor and a list of MFGs.
-    """
-    n_batches, n_hops = len(mfg_sizes), len(mfg_sizes[0]) - 1
-    output = []
-    for b_id in range(n_batches):
-        output_batch = []
-        output_batch.append(renumber_map_list[b_id])
-        output_batch.append(renumber_map_list[b_id][: mfg_sizes[b_id][-1]])
-        mfgs = [
-            SparseGraph(
-                size=(mfg_sizes[b_id][h_id], mfg_sizes[b_id][h_id + 1]),
-                src_ids=tensors_dict[b_id][h_id]["minors"],
-                cdst_ids=tensors_dict[b_id][h_id]["major_offsets"],
-                formats=["csc"],
-                reduce_memory=True,
-            )
-            for h_id in range(n_hops)
-        ]
-
-        output_batch.append(mfgs)
-
-        output.append(output_batch)
-
-    return output
-
-
-def create_homogeneous_sampled_graphs_from_dataframe_csc(
-    sampled_df: cudf.DataFrame, output_format: str = "cugraph_dgl.nn.SparseGraph"
-):
-    """Public API to create mini-batches of MFGs using a dataframe output by
-    BulkSampler, where the sampled graph is compressed in CSC format."""
-    if output_format == "cugraph_dgl.nn.SparseGraph":
-        return _create_homogeneous_sparse_graphs_from_csc(
-            *(_process_sampled_df_csc(sampled_df)),
-        )
-    elif output_format == "dgl.Block":
-        return _create_homogeneous_blocks_from_csc(
-            *(_process_sampled_df_csc(sampled_df)),
-        )
-    else:
-        raise ValueError(f"Invalid output format {output_format}")
-
-
-def create_homogeneous_sampled_graphs_from_tensors_csc(
-    tensors: Dict["torch.Tensor"], output_format: str = "cugraph_dgl.nn.SparseGraph"
-):
-    """Public API to create mini-batches of MFGs using a dataframe output by
-    BulkSampler, where the sampled graph is compressed in CSC format."""
-    if output_format == "cugraph_dgl.nn.SparseGraph":
-        return _create_homogeneous_sparse_graphs_from_csc(
-            *(_process_sampled_tensors_csc(tensors)),
-        )
-    elif output_format == "dgl.Block":
-        return _create_homogeneous_blocks_from_csc(
-            *(_process_sampled_tensors_csc(tensors)),
-        )
-    else:
-        raise ValueError(f"Invalid output format {output_format}")
diff --git a/python/cugraph-dgl/cugraph_dgl/features.py b/python/cugraph-dgl/cugraph_dgl/features.py
deleted file mode 100644
index 9dc009f4127..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/features.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-from cugraph.utilities.utils import import_optional, MissingModule
-
-torch = import_optional("torch")
-dgl = import_optional("dgl")
-wgth = import_optional("pylibwholegraph.torch")
-
-
-class WholeFeatureStore(
-    object if isinstance(dgl, MissingModule) else dgl.storages.base.FeatureStorage
-):
-    """
-    Interface for feature storage.
-    """
-
-    def __init__(
-        self,
-        tensor: "torch.Tensor",
-        memory_type: str = "distributed",
-        location: str = "cpu",
-    ):
-        """
-        Constructs a new WholeFeatureStore object that wraps a WholeGraph wholememory
-        distributed tensor.
-
-        Parameters
-        ----------
-        t: torch.Tensor
-            The local slice of the tensor being distributed.  These should be in order
-            by rank (i.e. rank 0 contains elements 0-9, rank 1 contains elements 10-19,
-            rank 3 contains elements 20-29, etc.)  The sizes do not need to be equal.
-        memory_type: str (optional, default='distributed')
-            The memory type of this store.  Options are
-            'distributed', 'chunked', and 'continuous'.
-            For more information consult the WholeGraph
-            documentation.
-        location: str(optional, default='cpu')
-            The location ('cpu' or 'cuda') where data is stored.
-        """
-        self.__wg_comm = wgth.get_global_communicator()
-
-        if len(tensor.shape) > 2:
-            raise ValueError("Only 1-D or 2-D tensors are supported by WholeGraph.")
-
-        rank = torch.distributed.get_rank()
-        world_size = torch.distributed.get_world_size()
-
-        ld = torch.tensor(tensor.shape[0], device="cuda", dtype=torch.int64)
-        sizes = torch.empty((world_size,), device="cuda", dtype=torch.int64)
-        torch.distributed.all_gather_into_tensor(sizes, ld)
-
-        sizes = sizes.cpu()
-        ld = sizes.sum()
-
-        self.__td = -1 if len(tensor.shape) == 1 else tensor.shape[1]
-        global_shape = [
-            int(ld),
-            self.__td if self.__td > 0 else 1,
-        ]
-
-        if self.__td < 0:
-            tensor = tensor.reshape((tensor.shape[0], 1))
-
-        wg_tensor = wgth.create_wholememory_tensor(
-            self.__wg_comm,
-            memory_type,
-            location,
-            global_shape,
-            tensor.dtype,
-            [global_shape[1], 1],
-        )
-
-        offset = sizes[:rank].sum() if rank > 0 else 0
-
-        wg_tensor.scatter(
-            tensor.clone(memory_format=torch.contiguous_format).cuda(),
-            torch.arange(
-                offset, offset + tensor.shape[0], dtype=torch.int64, device="cuda"
-            ).contiguous(),
-        )
-
-        self.__wg_comm.barrier()
-
-        self.__wg_tensor = wg_tensor
-
-    def requires_ddp(self) -> bool:
-        return True
-
-    def fetch(
-        self,
-        indices: torch.Tensor,
-        device: torch.cuda.Device,
-        pin_memory=False,
-        **kwargs,
-    ):
-        if pin_memory:
-            warnings.warn("pin_memory has no effect for WholeFeatureStorage.")
-
-        t = self.__wg_tensor.gather(
-            indices.cuda(),
-            force_dtype=self.__wg_tensor.dtype,
-        )
-
-        if self.__td < 0:
-            t = t.reshape((t.shape[0],))
-
-        return t.to(torch.device(device))
diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py
deleted file mode 100644
index 88b93656fa8..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/graph.py
+++ /dev/null
@@ -1,931 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-from typing import Union, Optional, Dict, Tuple, List
-
-from cugraph.utilities.utils import import_optional
-from cugraph.gnn import cugraph_comms_get_raft_handle
-
-import cupy
-import pylibcugraph
-
-from cugraph_dgl.typing import TensorType
-from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor
-from cugraph_dgl.features import WholeFeatureStore
-from cugraph_dgl.view import (
-    HeteroNodeView,
-    HeteroNodeDataView,
-    HeteroEdgeView,
-    HeteroEdgeDataView,
-    EmbeddingView,
-)
-
-
-# Have to use import_optional even though these are required
-# dependencies in order to build properly.
-dgl = import_optional("dgl")
-torch = import_optional("torch")
-tensordict = import_optional("tensordict")
-
-HOMOGENEOUS_NODE_TYPE = "n"
-HOMOGENEOUS_EDGE_TYPE = (HOMOGENEOUS_NODE_TYPE, "e", HOMOGENEOUS_NODE_TYPE)
-
-
-class Graph:
-    """
-    cuGraph-backed duck-typed version of dgl.DGLGraph that distributes
-    the graph across workers.  This object uses lazy graph creation.
-    Users can repeatedly call add_edges, and the tensors won't
-    be converted into a cuGraph graph until one is needed
-    (i.e. when creating a loader). Supports
-    single-node/single-GPU, single-node/multi-GPU, and
-    multi-node/multi-GPU graph storage.
-
-    Each worker should have a slice of the graph locally, and
-    call put_edge_index with its slice.
-    """
-
-    def __init__(
-        self,
-        is_multi_gpu: bool = False,
-        ndata_storage="torch",
-        edata_storage="torch",
-        **kwargs,
-    ):
-        """
-        Parameters
-        ----------
-        is_multi_gpu: bool (optional, default=False)
-            Specifies whether this graph is distributed across GPUs.
-        ndata_storage: str (optional, default='torch')
-            Specifies where node data should be stored
-            (options are 'torch' and 'wholegraph').
-            If using PyTorch tensors for storage ('torch')
-            then data will be replicated across workers and data
-            for all nodes should be provided when calling add_nodes.
-            If using WholeGraph wholememory tensors for storage,
-            then data will be distributed across workers and only
-            the local slice of the data should be provided when
-            calling add_nodes.
-        edata_storage: str (optional, default='torch')
-            If using PyTorch tensors for storage ('torch')
-            then data will be replicated across workers and data
-            for all nodes should be provided when calling add_edge.
-            If using WholeGraph wholememory tensors for storage,
-            then data will be distributed across workers and only
-            the local slice of the data should be provided when
-            calling add_edges.
-        kwargs:
-            Optional kwargs for WholeGraph feature storage.
-        """
-
-        if ndata_storage not in ("torch", "wholegraph"):
-            raise ValueError(
-                "Invalid node storage type (valid types are 'torch' and 'wholegraph')"
-            )
-        if edata_storage not in ("torch", "wholegraph"):
-            raise ValueError(
-                "Invalid edge storage type (valid types are 'torch' and 'wholegraph')"
-            )
-
-        self.__num_nodes_dict = {}
-        self.__num_edges_dict = {}
-        self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,))
-
-        self.__graph = None
-        self.__vertex_offsets = None
-        self.__handle = None
-        self.__is_multi_gpu = is_multi_gpu
-
-        self.__ndata_storage_type = (
-            WholeFeatureStore
-            if ndata_storage == "wholegraph"
-            else dgl.storages.pytorch_tensor.PyTorchTensorStorage
-        )
-        self.__edata_storage_type = (
-            WholeFeatureStore
-            if edata_storage == "wholegraph"
-            else dgl.storages.pytorch_tensor.PyTorchTensorStorage
-        )
-        self.__ndata_storage = {}
-        self.__edata_storage = {}
-        self.__wg_kwargs = kwargs
-
-    @property
-    def is_multi_gpu(self):
-        return self.__is_multi_gpu
-
-    def to_canonical_etype(
-        self, etype: Union[str, Tuple[str, str, str]]
-    ) -> Tuple[str, str, str]:
-        if etype is None:
-            if len(self.canonical_etypes) > 1:
-                raise ValueError("Edge type is required for heterogeneous graphs.")
-            return HOMOGENEOUS_EDGE_TYPE
-
-        if isinstance(etype, tuple) and len(etype) == 3:
-            return etype
-
-        for src_type, rel_type, dst_type in self.__edge_indices.keys(
-            leaves_only=True, include_nested=True
-        ):
-            if etype == rel_type:
-                return (src_type, rel_type, dst_type)
-
-        raise ValueError("Unknown relation type " + etype)
-
-    def add_nodes(
-        self,
-        global_num_nodes: int,
-        data: Optional[Dict[str, TensorType]] = None,
-        ntype: Optional[str] = None,
-    ):
-        """
-        Adds the given number of nodes to this graph.  Can only be called once
-        per node type. The number of nodes specified here refers to the total
-        number of nodes across all workers (the entire graph). If the backing
-        feature store is distributed (i.e. wholegraph), then only local features
-        should be passed to the data argument.  If the backing feature store is
-        replicated, then features for all nodes in the graph should be passed to
-        the data argument, including those for nodes not on the local worker.
-
-        Parameters
-        ----------
-        global_num_nodes: int
-            The total number of nodes of the given type in this graph.
-            The same number should be passed to every worker.
-        data: Dict[str, TensorType] (optional, default=None)
-            Node feature tensors.
-        ntype: str (optional, default=None)
-            The node type being modified.  Required for heterogeneous graphs.
-        """
-        if ntype is None:
-            if len(self.__num_nodes_dict.keys()) > 1:
-                raise ValueError("Node type is required for heterogeneous graphs.")
-            ntype = HOMOGENEOUS_NODE_TYPE
-
-        if ntype in self.__num_nodes_dict:
-            raise ValueError(
-                "Calling add_nodes multiple types for the same "
-                "node type is not allowed in cuGraph-DGL"
-            )
-
-        if self.is_multi_gpu:
-            # Ensure all nodes got the same number of nodes passed
-            world_size = torch.distributed.get_world_size()
-            local_size = torch.tensor(
-                [global_num_nodes], device="cuda", dtype=torch.int64
-            )
-            ns = torch.empty((world_size,), device="cuda", dtype=torch.int64)
-            torch.distributed.all_gather_into_tensor(ns, local_size)
-            if not (ns == global_num_nodes).all():
-                raise ValueError("The global number of nodes must match on all workers")
-
-            # Ensure the sum of the feature shapes equals the global number of nodes.
-            if data is not None:
-                for feature_name, feature_tensor in data.items():
-                    features_size = torch.tensor(
-                        [int(feature_tensor.shape[0])], device="cuda", dtype=torch.int64
-                    )
-                    torch.distributed.all_reduce(
-                        features_size, op=torch.distributed.ReduceOp.SUM
-                    )
-                    if features_size != global_num_nodes:
-                        raise ValueError(
-                            "The total length of the feature vector across workers must"
-                            " match the global number of nodes but it does not "
-                            f"match for {feature_name}."
-                        )
-
-        self.__num_nodes_dict[ntype] = global_num_nodes
-
-        if data is not None:
-            for feature_name, feature_tensor in data.items():
-                self.__ndata_storage[ntype, feature_name] = self.__ndata_storage_type(
-                    _cast_to_torch_tensor(feature_tensor), **self.__wg_kwargs
-                )
-
-        self.__graph = None
-        self.__vertex_offsets = None
-
-    def __check_node_ids(self, ntype: str, ids: TensorType):
-        """
-        Ensures all node ids in the provided id tensor are valid.
-        Raises a ValueError if any are invalid.
-
-        Parameters
-        ----------
-        ntype: str
-            The node type being validated against.
-        ids:
-            The tensor of ids being validated.
-        """
-        if ntype in self.__num_nodes_dict:
-            if ids.max() + 1 > self.num_nodes(ntype):
-                raise ValueError(
-                    f"input tensor contains invalid node ids for type {ntype}"
-                )
-        else:
-            raise ValueError(
-                f"add_nodes() must be called for type {ntype} before calling num_edges."
-            )
-
-    def add_edges(
-        self,
-        u: TensorType,
-        v: TensorType,
-        data: Optional[Dict[str, TensorType]] = None,
-        etype: Optional[Union[str, Tuple[str, str, str]]] = None,
-    ) -> None:
-        """
-        Adds edges to this graph.  Must be called after add_nodes
-        is called for the src/dst node type. If the backing feature
-        store is distributed (i.e. wholegraph), then only local
-        features should be passed to the data argument.  If the
-        backing feature store is replicated, then features for
-        all edges should be passed to the data argument,
-        including those for edges not on the local worker.
-
-        Parameters
-        ----------
-        u: TensorType
-            1d tensor of source node ids (local slice of the distributed edgelist).
-        v: TensorType
-            1d tensor of destination node ids (local slice of the distributed edgelist).
-        data: Dict[str, TensorType] (optional, default=None)
-            Dictionary containing edge features for the new edges.
-        etype: Union[str, Tuple[str, str, str]]
-            The edge type of the edges being inserted.  Not required
-            for homogeneous graphs, which have only one edge type.
-        """
-
-        # Validate all inputs before proceeding
-        # The number of nodes for the src/dst type needs to be known and there cannot
-        # be any edges of this type in the graph.
-        dgl_can_edge_type = self.to_canonical_etype(etype)
-        src_type, _, dst_type = dgl_can_edge_type
-        if dgl_can_edge_type in self.__edge_indices.keys(
-            leaves_only=True, include_nested=True
-        ):
-            raise ValueError(
-                "This cuGraph-DGL graph already contains edges of type"
-                f" {dgl_can_edge_type}. Calling add_edges multiple times"
-                " for the same edge type is not supported."
-            )
-        self.__check_node_ids(src_type, u)
-        self.__check_node_ids(dst_type, v)
-
-        self.__edge_indices[dgl_can_edge_type] = torch.stack(
-            [
-                _cast_to_torch_tensor(u),
-                _cast_to_torch_tensor(v),
-            ]
-        ).to(self.idtype)
-
-        if data is not None:
-            for attr_name, attr_tensor in data.items():
-                self.__edata_storage[
-                    dgl_can_edge_type, attr_name
-                ] = self.__edata_storage_type(
-                    _cast_to_torch_tensor(attr_tensor), **self.__wg_kwargs
-                )
-
-        num_edges = self.__edge_indices[dgl_can_edge_type].shape[1]
-        if self.is_multi_gpu:
-            num_edges = torch.tensor([num_edges], device="cuda", dtype=torch.int64)
-            torch.distributed.all_reduce(num_edges, op=torch.distributed.ReduceOp.SUM)
-
-        self.__num_edges_dict[dgl_can_edge_type] = int(num_edges)
-
-        self.__graph = None
-        self.__vertex_offsets = None
-
-    def num_nodes(self, ntype: Optional[str] = None) -> int:
-        """
-        Returns the number of nodes of ntype, or if ntype is not provided,
-        the total number of nodes in the graph.
-        """
-        if ntype is None:
-            return sum(self.__num_nodes_dict.values())
-
-        return self.__num_nodes_dict[ntype]
-
-    def number_of_nodes(self, ntype: Optional[str] = None) -> int:
-        """
-        Alias for num_nodes.
-        """
-        return self.num_nodes(ntype=ntype)
-
-    def num_edges(self, etype: Union[str, Tuple[str, str, str]] = None) -> int:
-        """
-        Returns the number of edges of etype, or if etype is not provided,
-        the total number of edges in the graph.
-        """
-        if etype is None:
-            return sum(self.__num_edges_dict.values())
-
-        etype = self.to_canonical_etype(etype)
-        return self.__num_edges_dict[etype]
-
-    def number_of_edges(self, etype: Union[str, Tuple[str, str, str]] = None) -> int:
-        """
-        Alias for num_edges.
-        """
-        return self.num_edges(etype=etype)
-
-    @property
-    def ntypes(self) -> List[str]:
-        """
-        Returns the node type names in this graph.
-        """
-        return list(self.__num_nodes_dict.keys())
-
-    @property
-    def etypes(self) -> List[str]:
-        """
-        Returns the edge type names in this graph
-        (the second element of the canonical edge
-        type tuple).
-        """
-        return [et[1] for et in self.__num_edges_dict.keys()]
-
-    @property
-    def canonical_etypes(self) -> List[str]:
-        """
-        Returns the canonical edge type names in this
-        graph.
-        """
-        return list(self.__num_edges_dict.keys())
-
-    @property
-    def _vertex_offsets(self) -> Dict[str, int]:
-        if self.__vertex_offsets is None:
-            ordered_keys = sorted(list(self.ntypes))
-            self.__vertex_offsets = {}
-            offset = 0
-            for vtype in ordered_keys:
-                self.__vertex_offsets[vtype] = offset
-                offset += self.num_nodes(vtype)
-
-        return dict(self.__vertex_offsets)
-
-    def __get_edgelist(self, prob_attr=None) -> Dict[str, "torch.Tensor"]:
-        """
-        This function always returns src/dst labels with respect
-        to the out direction.
-
-        Returns
-        -------
-        Dict[str, torch.Tensor] with the following keys:
-            src: source vertices (int64)
-                Note that src is the 1st element of the DGL edge index.
-            dst: destination vertices (int64)
-                Note that dst is the 2nd element of the DGL edge index.
-            eid: edge ids for each edge (int64)
-                Note that these start from 0 for each edge type.
-            etp: edge types for each edge (int32)
-                Note that these are in lexicographic order.
-        """
-        sorted_keys = sorted(
-            list(self.__edge_indices.keys(leaves_only=True, include_nested=True))
-        )
-
-        # note that this still follows the DGL convention of (src, rel, dst)
-        # i.e. (author, writes, paper): [[0,1,2],[2,0,1]] is referring to a
-        # cuGraph graph where (paper 2) -> (author 0), (paper 0) -> (author 1),
-        # and (paper 1) -> (author 0)
-        edge_index = torch.concat(
-            [
-                torch.stack(
-                    [
-                        self.__edge_indices[src_type, rel_type, dst_type][0]
-                        + self._vertex_offsets[src_type],
-                        self.__edge_indices[src_type, rel_type, dst_type][1]
-                        + self._vertex_offsets[dst_type],
-                    ]
-                )
-                for (src_type, rel_type, dst_type) in sorted_keys
-            ],
-            axis=1,
-        ).cuda()
-
-        edge_type_array = torch.arange(
-            len(sorted_keys), dtype=torch.int32, device="cuda"
-        ).repeat_interleave(
-            torch.tensor(
-                [self.__edge_indices[et].shape[1] for et in sorted_keys],
-                device="cuda",
-                dtype=torch.int32,
-            )
-        )
-
-        num_edges_t = torch.tensor(
-            [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda"
-        )
-
-        if self.is_multi_gpu:
-            rank = torch.distributed.get_rank()
-            world_size = torch.distributed.get_world_size()
-
-            num_edges_all_t = torch.empty(
-                world_size, num_edges_t.numel(), dtype=torch.int64, device="cuda"
-            )
-            torch.distributed.all_gather_into_tensor(num_edges_all_t, num_edges_t)
-
-            start_offsets = num_edges_all_t[:rank].T.sum(axis=1)
-
-        else:
-            rank = 0
-            start_offsets = torch.zeros(
-                (len(sorted_keys),), dtype=torch.int64, device="cuda"
-            )
-            num_edges_all_t = num_edges_t.reshape((1, num_edges_t.numel()))
-
-        # Use pinned memory here for fast access to CPU/WG storage
-        edge_id_array_per_type = [
-            torch.arange(
-                start_offsets[i],
-                start_offsets[i] + num_edges_all_t[rank][i],
-                dtype=torch.int64,
-                device="cpu",
-            ).pin_memory()
-            for i in range(len(sorted_keys))
-        ]
-
-        # Retrieve the weights from the appropriate feature(s)
-        # DGL implicitly requires all edge types use the same
-        # feature name.
-        if prob_attr is None:
-            weights = None
-        else:
-            if len(sorted_keys) > 1:
-                weights = torch.concat(
-                    [
-                        self.edata[prob_attr][sorted_keys[i]][ix]
-                        for i, ix in enumerate(edge_id_array_per_type)
-                    ]
-                )
-            else:
-                weights = self.edata[prob_attr][edge_id_array_per_type[0]]
-
-        # Safe to move this to cuda because the consumer will always
-        # move it to cuda if it isn't already there.
-        edge_id_array = torch.concat(edge_id_array_per_type).cuda()
-
-        edgelist_dict = {
-            "src": edge_index[0],
-            "dst": edge_index[1],
-            "etp": edge_type_array,
-            "eid": edge_id_array,
-        }
-
-        if weights is not None:
-            edgelist_dict["wgt"] = weights
-
-        return edgelist_dict
-
-    @property
-    def is_homogeneous(self):
-        return len(self.__num_edges_dict) <= 1 and len(self.__num_nodes_dict) <= 1
-
-    @property
-    def idtype(self):
-        return torch.int64
-
-    @property
-    def _resource_handle(self):
-        if self.__handle is None:
-            if self.is_multi_gpu:
-                self.__handle = pylibcugraph.ResourceHandle(
-                    cugraph_comms_get_raft_handle().getHandle()
-                )
-            else:
-                self.__handle = pylibcugraph.ResourceHandle()
-        return self.__handle
-
-    def _graph(
-        self,
-        direction: str,
-        prob_attr: Optional[str] = None,
-    ) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]:
-        """
-        Gets the pylibcugraph Graph object with edges pointing in the given direction
-        (i.e. 'out' is standard, 'in' is reverse).
-        """
-
-        if direction not in ["out", "in"]:
-            raise ValueError(f"Invalid direction {direction} (expected 'in' or 'out').")
-
-        graph_properties = pylibcugraph.GraphProperties(
-            is_multigraph=True, is_symmetric=False
-        )
-
-        if self.__graph is not None:
-            if (
-                self.__graph["direction"] != direction
-                or self.__graph["prob_attr"] != prob_attr
-            ):
-                self.__graph = None
-
-        if self.__graph is None:
-            src_col, dst_col = ("src", "dst") if direction == "out" else ("dst", "src")
-            edgelist_dict = self.__get_edgelist(prob_attr=prob_attr)
-
-            if self.is_multi_gpu:
-                rank = torch.distributed.get_rank()
-                world_size = torch.distributed.get_world_size()
-
-                vertices_array = cupy.arange(self.num_nodes(), dtype="int64")
-                vertices_array = cupy.array_split(vertices_array, world_size)[rank]
-
-                graph = pylibcugraph.MGGraph(
-                    self._resource_handle,
-                    graph_properties,
-                    [cupy.asarray(edgelist_dict[src_col]).astype("int64")],
-                    [cupy.asarray(edgelist_dict[dst_col]).astype("int64")],
-                    vertices_array=[vertices_array],
-                    edge_id_array=[cupy.asarray(edgelist_dict["eid"])],
-                    edge_type_array=[cupy.asarray(edgelist_dict["etp"])],
-                    weight_array=[cupy.asarray(edgelist_dict["wgt"])]
-                    if "wgt" in edgelist_dict
-                    else None,
-                )
-            else:
-                graph = pylibcugraph.SGGraph(
-                    self._resource_handle,
-                    graph_properties,
-                    cupy.asarray(edgelist_dict[src_col]).astype("int64"),
-                    cupy.asarray(edgelist_dict[dst_col]).astype("int64"),
-                    vertices_array=cupy.arange(self.num_nodes(), dtype="int64"),
-                    edge_id_array=cupy.asarray(edgelist_dict["eid"]),
-                    edge_type_array=cupy.asarray(edgelist_dict["etp"]),
-                    weight_array=cupy.asarray(edgelist_dict["wgt"])
-                    if "wgt" in edgelist_dict
-                    else None,
-                )
-
-        self.__graph = {"graph": graph, "direction": direction, "prob_attr": prob_attr}
-
-        return self.__graph["graph"]
-
-    def _has_n_emb(self, ntype: str, emb_name: str) -> bool:
-        return (ntype, emb_name) in self.__ndata_storage
-
-    def _get_n_emb(
-        self, ntype: Union[str, None], emb_name: str, u: Union[str, TensorType]
-    ) -> Union["torch.Tensor", "EmbeddingView"]:
-        """
-        Gets the embedding of a single node type.
-        Unlike DGL, this function takes the string node
-        type name instead of an integer id.
-
-        Parameters
-        ----------
-        ntype: str
-            The node type to get the embedding of.
-        emb_name: str
-            The embedding name of the embedding to get.
-        u: Union[str, TensorType]
-            Nodes to get the representation of, or ALL
-            to get the representation of all nodes of
-            the given type (returns embedding view).
-
-        Returns
-        -------
-        Union[torch.Tensor, cugraph_dgl.view.EmbeddingView]
-            The embedding of the given edge type with the given embedding name.
-        """
-
-        if ntype is None:
-            if len(self.ntypes) == 1:
-                ntype = HOMOGENEOUS_NODE_TYPE
-            else:
-                raise ValueError("Must provide the node type for a heterogeneous graph")
-
-        if dgl.base.is_all(u):
-            return EmbeddingView(
-                self.__ndata_storage[ntype, emb_name], self.num_nodes(ntype)
-            )
-
-        try:
-            return self.__ndata_storage[ntype, emb_name].fetch(
-                _cast_to_torch_tensor(u), "cuda"
-            )
-        except RuntimeError as ex:
-            warnings.warn(
-                "Got error accessing data, trying again with index on device: "
-                + str(ex)
-            )
-            return self.__ndata_storage[ntype, emb_name].fetch(
-                _cast_to_torch_tensor(u).cuda(), "cuda"
-            )
-
-    def _has_e_emb(self, etype: Tuple[str, str, str], emb_name: str) -> bool:
-        return (etype, emb_name) in self.__edata_storage
-
-    def _get_e_emb(
-        self, etype: Tuple[str, str, str], emb_name: str, u: Union[str, TensorType]
-    ) -> "torch.Tensor":
-        """
-        Gets the embedding of a single edge type.
-        Unlike DGL, this function takes the canonical edge type
-        instead of an integer id.
-
-        Parameters
-        ----------
-        etype: str
-            The edge type to get the embedding of.
-        emb_name: str
-            The embedding name of the embedding to get.
-        u: Union[str, TensorType]
-            Edges to get the representation of, or ALL to
-            get the representation of all nodes of the
-            given type.
-
-        Returns
-        -------
-        torch.Tensor
-            The embedding of the given edge type with the given embedding name.
-        """
-
-        etype = self.to_canonical_etype(etype)
-
-        if dgl.base.is_all(u):
-            return EmbeddingView(
-                self.__edata_storage[etype, emb_name], self.num_edges(etype)
-            )
-
-        try:
-            return self.__edata_storage[etype, emb_name].fetch(
-                _cast_to_torch_tensor(u), "cuda"
-            )
-        except RuntimeError as ex:
-            warnings.warn(
-                "Got error accessing data, trying again with index on device: "
-                + str(ex)
-            )
-            return self.__edata_storage[etype, emb_name].fetch(
-                _cast_to_torch_tensor(u).cuda(), "cuda"
-            )
-
-    def _set_n_emb(
-        self, ntype: str, u: Union[str, TensorType], kv: Dict[str, TensorType]
-    ) -> None:
-        """
-        Stores or updates the embedding(s) of a single node type.
-        Unlike DGL, this function takes the string node type name
-        instead of an integer id.
-
-        The semantics of this function match those of add_nodes
-        with respect to whether or not the backing feature store
-        is distributed.
-
-        Parameters
-        ----------
-        ntype: str
-            The node type to store an embedding of.
-        u: Union[str, TensorType]
-            The indices to update, if updating the embedding.
-            Currently, updating a slice of an embedding is
-            unsupported, so this should be ALL.
-        kv: Dict[str, TensorType]
-            A mapping of embedding names to embedding tensors.
-        """
-
-        if not dgl.base.is_all(u):
-            raise NotImplementedError(
-                "Updating a slice of an embedding is "
-                "currently unimplemented in cuGraph-DGL."
-            )
-
-        for k, v in kv:
-            self.__ndata_storage[ntype, k] = self.__ndata_storage_type(
-                v,
-                **self.__wg_kwargs,
-            )
-
-    def _set_e_emb(
-        self, etype: str, u: Union[str, TensorType], kv: Dict[str, TensorType]
-    ) -> None:
-        """
-        Stores or updates the embedding(s) of a single edge type.
-        Unlike DGL, this function takes the canonical edge type name
-        instead of an integer id.
-
-        The semantics of this function match those of add_edges
-        with respect to whether or not the backing feature store
-        is distributed.
-
-        Parameters
-        ----------
-        etype: str
-            The edge type to store an embedding of.
-        u: Union[str, TensorType]
-            The indices to update, if updating the embedding.
-            Currently, updating a slice of an embedding is
-            unsupported, so this should be ALL.
-        kv: Dict[str, TensorType]
-            A mapping of embedding names to embedding tensors.
-        """
-
-        if not dgl.base.is_all(u):
-            raise NotImplementedError(
-                "Updating a slice of an embedding is "
-                "currently unimplemented in cuGraph-DGL."
-            )
-
-        for k, v in kv:
-            self.__edata_storage[etype, k] = self.__edata_storage_type(
-                v,
-                **self.__wg_kwargs,
-            )
-
-    def _pop_n_emb(self, ntype: str, key: str) -> "torch.Tensor":
-        """
-        Removes and returns the embedding of the given node
-        type with the given name.
-
-        Parameters
-        ----------
-        ntype:str
-            The node type.
-        key:str
-            The embedding name.
-
-        Returns
-        -------
-        The removed embedding.
-        """
-        return self.__ndata_storage[ntype, key].pop(key)
-
-    def _pop_e_emb(self, etype: str, key: str) -> "torch.Tensor":
-        """
-        Removes and returns the embedding of the given edge
-        type with the given name.
-
-        Parameters
-        ----------
-        etype:str
-            The node type.
-        key:str
-            The embedding name.
-
-        Returns
-        -------
-        torch.Tensor
-            The removed embedding.
-        """
-        return self.__edata_storage[etype, key].pop(key)
-
-    def _get_n_emb_keys(self, ntype: str) -> List[str]:
-        """
-        Gets a list of the embedding names for a given node
-        type.
-
-        Parameters
-        ----------
-        ntype: str
-            The node type to get embedding names for.
-
-        Returns
-        -------
-        List[str]
-            The list of embedding names for the given node type.
-        """
-        return [k for (t, k) in self.__ndata_storage if ntype == t]
-
-    def _get_e_emb_keys(self, etype: str) -> List[str]:
-        """
-        Gets a list of the embedding names for a given edge
-        type.
-
-        Parameters
-        ----------
-        etype: str
-            The edge type to get embedding names for.
-
-        Returns
-        -------
-        List[str]
-            The list of embedding names for the given edge type.
-        """
-        return [k for (t, k) in self.__edata_storage if etype == t]
-
-    def all_edges(
-        self,
-        form="uv",
-        order="eid",
-        etype: Union[str, Tuple[str, str, str]] = None,
-        device: Union[str, int, "torch.device"] = "cpu",
-    ):
-        """
-        Returns all edges with the specified edge type.
-        cuGraph-DGL currently only supports 'eid' format and
-        'eid' order.
-
-        Parameters
-        ----------
-        form: str (optional, default='uv')
-            The format to return ('uv', 'eid', 'all').
-
-        order: str (optional, default='eid')
-            The order to return edges in ('eid', 'srcdst')
-            cuGraph-DGL currently only supports 'eid'.
-        etype: Union[str, Tuple[str, str, str]] (optional, default=None)
-            The edge type to get.  Not required if this is
-            a homogeneous graph.  Can be the relation type if the
-            relation type is unique, or the canonical edge type.
-        device: Union[str, int, torch.device] (optional, default='cpu')
-            The device where returned edges should be stored
-            ('cpu', 'cuda', or device id).
-        """
-
-        if order != "eid":
-            raise NotImplementedError("cugraph-DGL only supports eid order.")
-
-        if etype is None and len(self.canonical_etypes) > 1:
-            raise ValueError("Edge type is required for heterogeneous graphs.")
-
-        etype = self.to_canonical_etype(etype)
-
-        if form == "eid":
-            return torch.arange(
-                0,
-                self.__num_edges_dict[etype],
-                dtype=self.idtype,
-                device=device,
-            )
-        else:
-            if self.is_multi_gpu:
-                # This can't be done because it requires collective communication.
-                raise ValueError(
-                    "Calling all_edges in a distributed graph with"
-                    " form 'uv' or 'all' is unsupported."
-                )
-
-            else:
-                eix = self.__edge_indices[etype].to(device)
-                if form == "uv":
-                    return eix[0], eix[1]
-                elif form == "all":
-                    return (
-                        eix[0],
-                        eix[1],
-                        torch.arange(
-                            self.__num_edges_dict[etype],
-                            dtype=self.idtype,
-                            device=device,
-                        ),
-                    )
-                else:
-                    raise ValueError(f"Invalid form {form}")
-
-    @property
-    def ndata(self) -> HeteroNodeDataView:
-        """
-        Returns a view of the node data in this graph which can be used to
-        access or modify node features.
-        """
-
-        if len(self.ntypes) == 1:
-            ntype = self.ntypes[0]
-            return HeteroNodeDataView(self, ntype, dgl.base.ALL)
-
-        return HeteroNodeDataView(self, self.ntypes, dgl.base.ALL)
-
-    @property
-    def edata(self) -> HeteroEdgeDataView:
-        """
-        Returns a view of the edge data in this graph which can be used to
-        access or modify edge features.
-        """
-        if len(self.canonical_etypes) == 1:
-            return HeteroEdgeDataView(self, None, dgl.base.ALL)
-
-        return HeteroEdgeDataView(self, self.canonical_etypes, dgl.base.ALL)
-
-    @property
-    def nodes(self) -> HeteroNodeView:
-        """
-        Returns a view of the nodes in this graph.
-        """
-        return HeteroNodeView(self)
-
-    @property
-    def edges(self) -> HeteroEdgeView:
-        """
-        Returns a view of the edges in this graph.
-        """
-        return HeteroEdgeView(self)
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/__init__.py b/python/cugraph-dgl/cugraph_dgl/nn/__init__.py
deleted file mode 100644
index 9a4a087baf4..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/nn/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .conv import *  # noqa
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py
deleted file mode 100644
index 3e7f2f076f0..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .base import SparseGraph
-from .gatconv import GATConv
-from .gatv2conv import GATv2Conv
-from .relgraphconv import RelGraphConv
-from .sageconv import SAGEConv
-from .transformerconv import TransformerConv
-
-__all__ = [
-    "SparseGraph",
-    "GATConv",
-    "GATv2Conv",
-    "RelGraphConv",
-    "SAGEConv",
-    "TransformerConv",
-]
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py
deleted file mode 100644
index fcd5a26aee6..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py
+++ /dev/null
@@ -1,376 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Tuple, Union
-
-from cugraph.utilities.utils import import_optional
-
-import cugraph_dgl
-
-torch = import_optional("torch")
-ops_torch = import_optional("pylibcugraphops.pytorch")
-dgl = import_optional("dgl")
-
-
-def compress_ids(ids: torch.Tensor, size: int) -> torch.Tensor:
-    return torch._convert_indices_from_coo_to_csr(
-        ids, size, out_int32=ids.dtype == torch.int32
-    )
-
-
-def decompress_ids(c_ids: torch.Tensor) -> torch.Tensor:
-    ids = torch.arange(c_ids.numel() - 1, dtype=c_ids.dtype, device=c_ids.device)
-    return ids.repeat_interleave(c_ids[1:] - c_ids[:-1])
-
-
-class SparseGraph(object):
-    r"""A class to create and store different sparse formats needed by
-    cugraph-ops. It always creates a CSC representation and can provide COO- or
-    CSR-format if needed.
-
-    Parameters
-    ----------
-    size: tuple of int
-        Size of the adjacency matrix: (num_src_nodes, num_dst_nodes).
-
-    src_ids: torch.Tensor
-        Source indices of the edges.
-
-    dst_ids: torch.Tensor, optional
-        Destination indices of the edges.
-
-    csrc_ids: torch.Tensor, optional
-        Compressed source indices. It is a monotonically increasing array of
-        size (num_src_nodes + 1,). For the k-th source node, its neighborhood
-        consists of the destinations between `dst_indices[csrc_indices[k]]` and
-        `dst_indices[csrc_indices[k+1]]`.
-
-    cdst_ids: torch.Tensor, optional
-        Compressed destination indices. It is a monotonically increasing array of
-        size (num_dst_nodes + 1,). For the k-th destination node, its neighborhood
-        consists of the sources between `src_indices[cdst_indices[k]]` and
-        `src_indices[cdst_indices[k+1]]`.
-
-    values: torch.Tensor, optional
-        Values on the edges.
-
-    is_sorted: bool
-        Whether the COO inputs (src_ids, dst_ids, values) have been sorted by
-        `dst_ids` in an ascending order. CSC layout creation is much faster
-        when sorted.
-
-    formats: str or tuple of str, optional
-        The desired sparse formats to create for the graph. The formats tuple
-        must include "csc". Default: "csc".
-
-    reduce_memory: bool, optional
-        When set, the tensors are not required by the desired formats will be
-        set to `None`. Default: True.
-
-    Notes
-    -----
-    For MFGs (sampled graphs), the node ids must have been renumbered.
-    """
-
-    supported_formats = {
-        "coo": ("_src_ids", "_dst_ids"),
-        "csc": ("_cdst_ids", "_src_ids"),
-        "csr": ("_csrc_ids", "_dst_ids", "_perm_csc2csr"),
-    }
-
-    all_tensors = set(
-        [
-            "_src_ids",
-            "_dst_ids",
-            "_csrc_ids",
-            "_cdst_ids",
-            "_perm_coo2csc",
-            "_perm_csc2csr",
-        ]
-    )
-
-    def __init__(
-        self,
-        size: Tuple[int, int],
-        src_ids: torch.Tensor,
-        dst_ids: Optional[torch.Tensor] = None,
-        csrc_ids: Optional[torch.Tensor] = None,
-        cdst_ids: Optional[torch.Tensor] = None,
-        values: Optional[torch.Tensor] = None,
-        is_sorted: bool = False,
-        formats: Union[str, Tuple[str]] = "csc",
-        reduce_memory: bool = True,
-    ):
-        self._num_src_nodes, self._num_dst_nodes = size
-        self._is_sorted = is_sorted
-
-        if dst_ids is None and cdst_ids is None:
-            raise ValueError(
-                "One of 'dst_ids' and 'cdst_ids' must be given "
-                "to create a SparseGraph."
-            )
-
-        if src_ids is not None:
-            src_ids = src_ids.contiguous()
-
-        if dst_ids is not None:
-            dst_ids = dst_ids.contiguous()
-
-        if csrc_ids is not None:
-            if csrc_ids.numel() != self._num_src_nodes + 1:
-                raise RuntimeError(
-                    f"Size mismatch for 'csrc_ids': expected ({size[0] + 1},), "
-                    f"but got {tuple(csrc_ids.size())}"
-                )
-            csrc_ids = csrc_ids.contiguous()
-
-        if cdst_ids is not None:
-            if cdst_ids.numel() != self._num_dst_nodes + 1:
-                raise RuntimeError(
-                    f"Size mismatch for 'cdst_ids': expected ({size[1] + 1},), "
-                    f"but got {tuple(cdst_ids.size())}"
-                )
-            cdst_ids = cdst_ids.contiguous()
-
-        if values is not None:
-            values = values.contiguous()
-
-        self._src_ids = src_ids
-        self._dst_ids = dst_ids
-        self._csrc_ids = csrc_ids
-        self._cdst_ids = cdst_ids
-        self._values = values
-        self._perm_coo2csc = None
-        self._perm_csc2csr = None
-
-        if isinstance(formats, str):
-            formats = (formats,)
-        self._formats = formats
-
-        if "csc" not in formats:
-            raise ValueError(
-                f"{self.__class__.__name__}.formats must contain "
-                f"'csc', but got {formats}."
-            )
-
-        # always create csc first
-        if self._cdst_ids is None:
-            if not self._is_sorted:
-                self._dst_ids, self._perm_coo2csc = torch.sort(self._dst_ids)
-                self._src_ids = self._src_ids[self._perm_coo2csc]
-                if self._values is not None:
-                    self._values = self._values[self._perm_coo2csc]
-            self._cdst_ids = compress_ids(self._dst_ids, self._num_dst_nodes)
-
-        for format_ in formats:
-            assert format_ in SparseGraph.supported_formats
-            self.__getattribute__(f"{format_}")()
-
-        self._reduce_memory = reduce_memory
-        if reduce_memory:
-            self.reduce_memory()
-
-    def reduce_memory(self):
-        """Remove the tensors that are not necessary to create the desired sparse
-        formats to reduce memory footprint."""
-        if self._formats is None:
-            return
-
-        tensors_needed = []
-        for f in self._formats:
-            tensors_needed += SparseGraph.supported_formats[f]
-        for t in SparseGraph.all_tensors.difference(set(tensors_needed)):
-            self.__dict__[t] = None
-
-    def src_ids(self) -> torch.Tensor:
-        return self._src_ids
-
-    def cdst_ids(self) -> torch.Tensor:
-        return self._cdst_ids
-
-    def dst_ids(self) -> torch.Tensor:
-        if self._dst_ids is None:
-            self._dst_ids = decompress_ids(self._cdst_ids)
-        return self._dst_ids
-
-    def csrc_ids(self) -> torch.Tensor:
-        if self._csrc_ids is None:
-            src_ids, self._perm_csc2csr = torch.sort(self._src_ids)
-            self._csrc_ids = compress_ids(src_ids, self._num_src_nodes)
-        return self._csrc_ids
-
-    def num_src_nodes(self):
-        return self._num_src_nodes
-
-    def num_dst_nodes(self):
-        return self._num_dst_nodes
-
-    def values(self):
-        return self._values
-
-    def formats(self):
-        return self._formats
-
-    def coo(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-        if "coo" not in self.formats():
-            raise RuntimeError(
-                "The SparseGraph did not create a COO layout. "
-                "Set 'formats' list to include 'coo' when creating the graph."
-            )
-        return self.src_ids(), self.dst_ids(), self._values
-
-    def csc(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-        if "csc" not in self.formats():
-            raise RuntimeError(
-                "The SparseGraph did not create a CSC layout. "
-                "Set 'formats' list to include 'csc' when creating the graph."
-            )
-        return self.cdst_ids(), self.src_ids(), self._values
-
-    def csr(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-        if "csr" not in self.formats():
-            raise RuntimeError(
-                "The SparseGraph did not create a CSR layout. "
-                "Set 'formats' list to include 'csr' when creating the graph."
-            )
-        csrc_ids = self.csrc_ids()
-        dst_ids = self.dst_ids()[self._perm_csc2csr]
-        value = self._values
-        if value is not None:
-            value = value[self._perm_csc2csr]
-        return csrc_ids, dst_ids, value
-
-    def __repr__(self) -> str:
-        return (
-            f"{self.__class__.__name__}(num_src_nodes={self._num_src_nodes}, "
-            f"num_dst_nodes={self._num_dst_nodes}, "
-            f"num_edges={self._src_ids.size(0)}, formats={self._formats})"
-        )
-
-    def to(self, device: Union[torch.device, str, int]) -> "cugraph_dgl.nn.SparseGraph":
-        sg = SparseGraph(
-            src_ids=None if self._src_ids is None else self._src_ids.to(device),
-            dst_ids=None if self._dst_ids is None else self._dst_ids.to(device),
-            csrc_ids=None if self._csrc_ids is None else self._csrc_ids.to(device),
-            cdst_ids=None if self._cdst_ids is None else self._cdst_ids.to(device),
-            values=None if self._values is None else self._values.to(device),
-            is_sorted=self._is_sorted,
-            formats=self._formats,
-            reduce_memory=self._reduce_memory,
-        )
-
-        sg._perm_coo2csc = (
-            None if self._perm_coo2csc is None else self._perm_coo2csc.to(device)
-        )
-        sg._perm_csc2csr = (
-            None if self._perm_csc2csr is None else self._perm_csc2csr.to(device)
-        )
-
-        return sg
-
-
-class BaseConv(torch.nn.Module):
-    r"""An abstract base class for cugraph-ops nn module."""
-
-    def __init__(self):
-        super().__init__()
-
-    def reset_parameters(self):
-        r"""Resets all learnable parameters of the module."""
-        raise NotImplementedError
-
-    def forward(self, *args):
-        r"""Runs the forward pass of the module."""
-        raise NotImplementedError
-
-    def get_cugraph_ops_CSC(
-        self,
-        g: Union[SparseGraph, dgl.DGLHeteroGraph],
-        is_bipartite: bool = False,
-        max_in_degree: Optional[int] = None,
-    ) -> ops_torch.CSC:
-        """Create CSC structure needed by cugraph-ops."""
-
-        if not isinstance(g, (SparseGraph, dgl.DGLHeteroGraph)):
-            raise TypeError(
-                f"The graph has to be either a 'cugraph_dgl.nn.SparseGraph' or "
-                f"'dgl.DGLHeteroGraph', but got '{type(g)}'."
-            )
-
-        # TODO: max_in_degree should default to None in pylibcugraphops
-        if max_in_degree is None:
-            max_in_degree = -1
-
-        if isinstance(g, SparseGraph):
-            offsets, indices, _ = g.csc()
-        else:
-            offsets, indices, _ = g.adj_tensors("csc")
-
-        graph = ops_torch.CSC(
-            offsets=offsets,
-            indices=indices,
-            num_src_nodes=g.num_src_nodes(),
-            dst_max_in_degree=max_in_degree,
-            is_bipartite=is_bipartite,
-        )
-
-        return graph
-
-    def get_cugraph_ops_HeteroCSC(
-        self,
-        g: Union[SparseGraph, dgl.DGLHeteroGraph],
-        num_edge_types: int,
-        etypes: Optional[torch.Tensor] = None,
-        is_bipartite: bool = False,
-        max_in_degree: Optional[int] = None,
-    ) -> ops_torch.HeteroCSC:
-        """Create HeteroCSC structure needed by cugraph-ops."""
-
-        if not isinstance(g, (SparseGraph, dgl.DGLHeteroGraph)):
-            raise TypeError(
-                f"The graph has to be either a 'cugraph_dgl.nn.SparseGraph' or "
-                f"'dgl.DGLHeteroGraph', but got '{type(g)}'."
-            )
-
-        # TODO: max_in_degree should default to None in pylibcugraphops
-        if max_in_degree is None:
-            max_in_degree = -1
-
-        if isinstance(g, SparseGraph):
-            offsets, indices, etypes = g.csc()
-            if etypes is None:
-                raise ValueError(
-                    "SparseGraph must have 'values' to create HeteroCSC. "
-                    "Pass in edge types as 'values' when creating the SparseGraph."
-                )
-            etypes = etypes.int()
-        else:
-            if etypes is None:
-                raise ValueError(
-                    "'etypes' is required when creating HeteroCSC "
-                    "from dgl.DGLHeteroGraph."
-                )
-            offsets, indices, perm = g.adj_tensors("csc")
-            etypes = etypes[perm].int()
-
-        graph = ops_torch.HeteroCSC(
-            offsets=offsets,
-            indices=indices,
-            edge_types=etypes,
-            num_src_nodes=g.num_src_nodes(),
-            num_edge_types=num_edge_types,
-            dst_max_in_degree=max_in_degree,
-            is_bipartite=is_bipartite,
-        )
-
-        return graph
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py
deleted file mode 100644
index e8813271fd8..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py
+++ /dev/null
@@ -1,314 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Union
-
-from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph
-from cugraph.utilities.utils import import_optional
-
-dgl = import_optional("dgl")
-torch = import_optional("torch")
-nn = import_optional("torch.nn")
-ops_torch = import_optional("pylibcugraphops.pytorch")
-
-
-class GATConv(BaseConv):
-    r"""Graph attention layer from `Graph Attention Network
-    <https://arxiv.org/pdf/1710.10903.pdf>`__, with the sparse aggregation
-    accelerated by cugraph-ops.
-
-    Parameters
-    ----------
-    in_feats : int or (int, int)
-        Input feature size. A pair denotes feature sizes of source and
-        destination nodes.
-    out_feats : int
-        Output feature size.
-    num_heads : int
-        Number of heads in multi-head attention.
-    feat_drop : float, optional
-        Dropout rate on feature. Defaults: ``0``.
-    concat : bool, optional
-        If False, the multi-head attentions are averaged instead of concatenated.
-        Default: ``True``.
-    edge_feats : int, optional
-        Edge feature size. Default: ``None``.
-    negative_slope : float, optional
-        LeakyReLU angle of negative slope. Defaults: ``0.2``.
-    residual : bool, optional
-        If True, use residual connection. Defaults: ``False``.
-    allow_zero_in_degree : bool, optional
-        If there are 0-in-degree nodes in the graph, output for those nodes will
-        be invalid since no message will be passed to those nodes. This is
-        harmful for some applications causing silent performance regression.
-        This module will raise a DGLError if it detects 0-in-degree nodes in
-        input graph. By setting ``True``, it will suppress the check and let the
-        users handle it by themselves. Defaults: ``False``.
-    bias : bool, optional
-        If True, learns a bias term. Defaults: ``True``.
-
-    Examples
-    --------
-    >>> import dgl
-    >>> import torch
-    >>> from cugraph_dgl.nn import GATConv
-    ...
-    >>> device = 'cuda'
-    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3])).to(device)
-    >>> g = dgl.add_self_loop(g)
-    >>> feat = torch.ones(6, 10).to(device)
-    >>> conv = GATConv(10, 2, num_heads=3).to(device)
-    >>> res = conv(g, feat)
-    >>> res
-    tensor([[[ 0.2340,  1.9226],
-            [ 1.6477, -1.9986],
-            [ 1.1138, -1.9302]],
-            [[ 0.2340,  1.9226],
-            [ 1.6477, -1.9986],
-            [ 1.1138, -1.9302]],
-            [[ 0.2340,  1.9226],
-            [ 1.6477, -1.9986],
-            [ 1.1138, -1.9302]],
-            [[ 0.2340,  1.9226],
-            [ 1.6477, -1.9986],
-            [ 1.1138, -1.9302]],
-            [[ 0.2340,  1.9226],
-            [ 1.6477, -1.9986],
-            [ 1.1138, -1.9302]],
-            [[ 0.2340,  1.9226],
-            [ 1.6477, -1.9986],
-            [ 1.1138, -1.9302]]], device='cuda:0', grad_fn=<ViewBackward0>)
-    """
-
-    def __init__(
-        self,
-        in_feats: Union[int, tuple[int, int]],
-        out_feats: int,
-        num_heads: int,
-        feat_drop: float = 0.0,
-        concat: bool = True,
-        edge_feats: Optional[int] = None,
-        negative_slope: float = 0.2,
-        residual: bool = False,
-        allow_zero_in_degree: bool = False,
-        bias: bool = True,
-    ):
-        super().__init__()
-
-        if isinstance(in_feats, int):
-            self.in_feats_src = self.in_feats_dst = in_feats
-        else:
-            self.in_feats_src, self.in_feats_dst = in_feats
-        self.in_feats = in_feats
-        self.out_feats = out_feats
-        self.num_heads = num_heads
-        self.feat_drop = nn.Dropout(feat_drop)
-        self.concat = concat
-        self.edge_feats = edge_feats
-        self.negative_slope = negative_slope
-        self.residual = residual
-        self.allow_zero_in_degree = allow_zero_in_degree
-
-        if isinstance(in_feats, int):
-            self.lin = nn.Linear(in_feats, num_heads * out_feats, bias=False)
-        else:
-            self.lin_src = nn.Linear(
-                self.in_feats_src, num_heads * out_feats, bias=False
-            )
-            self.lin_dst = nn.Linear(
-                self.in_feats_dst, num_heads * out_feats, bias=False
-            )
-
-        if edge_feats is not None:
-            self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False)
-            self.attn_weights = nn.Parameter(torch.empty(3 * num_heads * out_feats))
-        else:
-            self.register_parameter("lin_edge", None)
-            self.attn_weights = nn.Parameter(torch.empty(2 * num_heads * out_feats))
-
-        out_dim = num_heads * out_feats if concat else out_feats
-        if residual:
-            if self.in_feats_dst != out_dim:
-                self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias)
-            else:
-                self.lin_res = nn.Identity()
-        else:
-            self.register_buffer("lin_res", None)
-
-        if bias and not isinstance(self.lin_res, nn.Linear):
-            if concat:
-                self.bias = nn.Parameter(torch.empty(num_heads, out_feats))
-            else:
-                self.bias = nn.Parameter(torch.empty(out_feats))
-        else:
-            self.register_buffer("bias", None)
-
-        self.reset_parameters()
-
-    def set_allow_zero_in_degree(self, set_value):
-        r"""Set allow_zero_in_degree flag."""
-        self.allow_zero_in_degree = set_value
-
-    def reset_parameters(self):
-        r"""Reinitialize learnable parameters."""
-        gain = nn.init.calculate_gain("relu")
-        if hasattr(self, "lin"):
-            nn.init.xavier_normal_(self.lin.weight, gain=gain)
-        else:
-            nn.init.xavier_normal_(self.lin_src.weight, gain=gain)
-            nn.init.xavier_normal_(self.lin_dst.weight, gain=gain)
-
-        nn.init.xavier_normal_(
-            self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain
-        )
-        if self.lin_edge is not None:
-            self.lin_edge.reset_parameters()
-
-        if self.lin_res is not None:
-            self.lin_res.reset_parameters()
-
-        if self.bias is not None:
-            nn.init.zeros_(self.bias)
-
-    def forward(
-        self,
-        g: Union[SparseGraph, dgl.DGLHeteroGraph],
-        nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
-        efeat: Optional[torch.Tensor] = None,
-        max_in_degree: Optional[int] = None,
-        deterministic_dgrad: bool = False,
-        deterministic_wgrad: bool = False,
-        high_precision_dgrad: bool = False,
-        high_precision_wgrad: bool = False,
-    ) -> torch.Tensor:
-        r"""Forward computation.
-
-        Parameters
-        ----------
-        graph : DGLGraph or SparseGraph
-            The graph.
-        nfeat : torch.Tensor or (torch.Tensor, torch.Tensor)
-            Node features. If given as a tuple, the two elements correspond to
-            the source and destination node features, respectively, in a
-            bipartite graph.
-        efeat: torch.Tensor, optional
-            Optional edge features.
-        max_in_degree : int
-            Maximum in-degree of destination nodes. When :attr:`g` is generated
-            from a neighbor sampler, the value should be set to the corresponding
-            :attr:`fanout`. This option is used to invoke the MFG-variant of
-            cugraph-ops kernel.
-        deterministic_dgrad : bool, default=False
-            Optional flag indicating whether the feature gradients
-            are computed deterministically using a dedicated workspace buffer.
-        deterministic_wgrad: bool, default=False
-            Optional flag indicating whether the weight gradients
-            are computed deterministically using a dedicated workspace buffer.
-        high_precision_dgrad: bool, default=False
-            Optional flag indicating whether gradients for inputs in half precision
-            are kept in single precision as long as possible and only casted to
-            the corresponding input type at the very end.
-        high_precision_wgrad: bool, default=False
-            Optional flag indicating whether gradients for weights in half precision
-            are kept in single precision as long as possible and only casted to
-            the corresponding input type at the very end.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, H, D_{out})` where
-            :math:`H` is the number of heads, and :math:`D_{out}` is size of
-            output feature.
-        """
-        if isinstance(g, dgl.DGLHeteroGraph):
-            if not self.allow_zero_in_degree:
-                if (g.in_degrees() == 0).any():
-                    raise dgl.base.DGLError(
-                        "There are 0-in-degree nodes in the graph, "
-                        "output for those nodes will be invalid. "
-                        "This is harmful for some applications, "
-                        "causing silent performance regression. "
-                        "Adding self-loop on the input graph by "
-                        "calling `g = dgl.add_self_loop(g)` will resolve "
-                        "the issue. Setting ``allow_zero_in_degree`` "
-                        "to be `True` when constructing this module will "
-                        "suppress the check and let the code run."
-                    )
-
-        bipartite = isinstance(nfeat, (list, tuple))
-
-        _graph = self.get_cugraph_ops_CSC(
-            g, is_bipartite=bipartite, max_in_degree=max_in_degree
-        )
-        if deterministic_dgrad:
-            _graph.add_reverse_graph()
-
-        if bipartite:
-            nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1]))
-            nfeat_dst_orig = nfeat[1]
-        else:
-            nfeat = self.feat_drop(nfeat)
-            nfeat_dst_orig = nfeat[: g.num_dst_nodes()]
-
-        if efeat is not None:
-            if self.lin_edge is None:
-                raise RuntimeError(
-                    f"{self.__class__.__name__}.edge_feats must be set to "
-                    f"accept edge features."
-                )
-            efeat = self.lin_edge(efeat)
-
-        if bipartite:
-            if not hasattr(self, "lin_src"):
-                nfeat_src = self.lin(nfeat[0])
-                nfeat_dst = self.lin(nfeat[1])
-            else:
-                nfeat_src = self.lin_src(nfeat[0])
-                nfeat_dst = self.lin_dst(nfeat[1])
-        else:
-            if not hasattr(self, "lin"):
-                raise RuntimeError(
-                    f"{self.__class__.__name__}.in_feats is expected to be an "
-                    f"integer when the graph is not bipartite, "
-                    f"but got {self.in_feats}."
-                )
-            nfeat = self.lin(nfeat)
-
-        out = ops_torch.operators.mha_gat_n2n(
-            (nfeat_src, nfeat_dst) if bipartite else nfeat,
-            self.attn_weights,
-            _graph,
-            num_heads=self.num_heads,
-            activation="LeakyReLU",
-            negative_slope=self.negative_slope,
-            concat_heads=self.concat,
-            edge_feat=efeat,
-            deterministic_dgrad=deterministic_dgrad,
-            deterministic_wgrad=deterministic_wgrad,
-            high_precision_dgrad=high_precision_dgrad,
-            high_precision_wgrad=high_precision_wgrad,
-        )[: g.num_dst_nodes()]
-
-        if self.concat:
-            out = out.view(-1, self.num_heads, self.out_feats)
-
-        if self.residual:
-            res = self.lin_res(nfeat_dst_orig).view(-1, self.num_heads, self.out_feats)
-            if not self.concat:
-                res = res.mean(dim=1)
-            out = out + res
-
-        if self.bias is not None:
-            out = out + self.bias
-
-        return out
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py
deleted file mode 100644
index 4f47005f8ee..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py
+++ /dev/null
@@ -1,254 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Union
-
-from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph
-from cugraph.utilities.utils import import_optional
-
-dgl = import_optional("dgl")
-torch = import_optional("torch")
-nn = import_optional("torch.nn")
-ops_torch = import_optional("pylibcugraphops.pytorch")
-
-
-class GATv2Conv(BaseConv):
-    r"""GATv2 from `How Attentive are Graph Attention Networks?
-    <https://arxiv.org/pdf/2105.14491.pdf>`__, with the sparse aggregation
-    accelerated by cugraph-ops.
-
-    Parameters
-    ----------
-    in_feats : int or (int, int)
-        Input feature size. A pair denotes feature sizes of source and
-        destination nodes.
-    out_feats : int
-        Output feature size.
-    num_heads : int
-        Number of heads in Multi-Head Attention.
-    feat_drop : float, optional
-        Dropout rate on feature. Defaults: ``0``.
-    concat : bool, optional
-        If False, the multi-head attentions are averaged instead of concatenated.
-        Default: ``True``.
-    edge_feats : int, optional
-        Edge feature size. Default: ``None``.
-    negative_slope : float, optional
-        LeakyReLU angle of negative slope. Defaults: ``0.2``.
-    residual : bool, optional
-        If True, use residual connection. Defaults: ``False``.
-    allow_zero_in_degree : bool, optional
-        If there are 0-in-degree nodes in the graph, output for those nodes will
-        be invalid since no message will be passed to those nodes. This is
-        harmful for some applications causing silent performance regression.
-        This module will raise a DGLError if it detects 0-in-degree nodes in
-        input graph. By setting ``True``, it will suppress the check and let the
-        users handle it by themselves. Defaults: ``False``.
-    bias : bool, optional
-        If True, learns a bias term. Defaults: ``True``.
-    share_weights : bool, optional
-        If ``True``, the same matrix will be applied to the source and the
-        destination node features. Defaults: ``False``.
-    """
-
-    def __init__(
-        self,
-        in_feats: Union[int, tuple[int, int]],
-        out_feats: int,
-        num_heads: int,
-        feat_drop: float = 0.0,
-        concat: bool = True,
-        edge_feats: Optional[int] = None,
-        negative_slope: float = 0.2,
-        residual: bool = False,
-        allow_zero_in_degree: bool = False,
-        bias: bool = True,
-        share_weights: bool = False,
-    ):
-        super().__init__()
-
-        if isinstance(in_feats, int):
-            self.in_feats_src = self.in_feats_dst = in_feats
-        else:
-            self.in_feats_src, self.in_feats_dst = in_feats
-        self.in_feats = in_feats
-        self.out_feats = out_feats
-        self.num_heads = num_heads
-        self.feat_drop = nn.Dropout(feat_drop)
-        self.concat = concat
-        self.edge_feats = edge_feats
-        self.negative_slope = negative_slope
-        self.residual = residual
-        self.allow_zero_in_degree = allow_zero_in_degree
-        self.share_weights = share_weights
-        self.bias = bias
-
-        self.lin_src = nn.Linear(self.in_feats_src, num_heads * out_feats, bias=bias)
-        if share_weights:
-            if self.in_feats_src != self.in_feats_dst:
-                raise ValueError(
-                    f"Input feature size of source and destination "
-                    f"nodes must be identical when share_weights is enabled, "
-                    f"but got {self.in_feats_src} and {self.in_feats_dst}."
-                )
-            self.lin_dst = self.lin_src
-        else:
-            self.lin_dst = nn.Linear(
-                self.in_feats_dst, num_heads * out_feats, bias=bias
-            )
-
-        self.attn_weights = nn.Parameter(torch.empty(num_heads * out_feats))
-
-        if edge_feats is not None:
-            self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False)
-        else:
-            self.register_parameter("lin_edge", None)
-
-        out_dim = num_heads * out_feats if concat else out_feats
-        if residual:
-            if self.in_feats_dst != out_dim:
-                self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias)
-            else:
-                self.lin_res = nn.Identity()
-        else:
-            self.register_buffer("lin_res", None)
-
-        self.reset_parameters()
-
-    def set_allow_zero_in_degree(self, set_value):
-        r"""Set allow_zero_in_degree flag."""
-        self.allow_zero_in_degree = set_value
-
-    def reset_parameters(self):
-        r"""Reinitialize learnable parameters."""
-        gain = nn.init.calculate_gain("relu")
-        nn.init.xavier_normal_(self.lin_src.weight, gain=gain)
-        nn.init.xavier_normal_(self.lin_dst.weight, gain=gain)
-
-        nn.init.xavier_normal_(
-            self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain
-        )
-        if self.lin_edge is not None:
-            self.lin_edge.reset_parameters()
-
-        if self.lin_res is not None:
-            self.lin_res.reset_parameters()
-
-    def forward(
-        self,
-        g: Union[SparseGraph, dgl.DGLHeteroGraph],
-        nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
-        efeat: Optional[torch.Tensor] = None,
-        max_in_degree: Optional[int] = None,
-        deterministic_dgrad: bool = False,
-        deterministic_wgrad: bool = False,
-    ) -> torch.Tensor:
-        r"""Forward computation.
-
-        Parameters
-        ----------
-        graph : DGLGraph or SparseGraph
-            The graph.
-        nfeat : torch.Tensor
-            Input features of shape :math:`(N, D_{in})`.
-        efeat: torch.Tensor, optional
-            Optional edge features.
-        max_in_degree : int
-            Maximum in-degree of destination nodes. When :attr:`g` is generated
-            from a neighbor sampler, the value should be set to the corresponding
-            :attr:`fanout`. This option is used to invoke the MFG-variant of
-            cugraph-ops kernel.
-        deterministic_dgrad : bool, default=False
-            Optional flag indicating whether the feature gradients
-            are computed deterministically using a dedicated workspace buffer.
-        deterministic_wgrad: bool, default=False
-            Optional flag indicating whether the weight gradients
-            are computed deterministically using a dedicated workspace buffer.
-
-        Returns
-        -------
-        torch.Tensor
-            The output feature of shape :math:`(N, H, D_{out})` where
-            :math:`H` is the number of heads, and :math:`D_{out}` is size of
-            output feature.
-        """
-
-        if isinstance(g, dgl.DGLHeteroGraph):
-            if not self.allow_zero_in_degree:
-                if (g.in_degrees() == 0).any():
-                    raise dgl.base.DGLError(
-                        "There are 0-in-degree nodes in the graph, "
-                        "output for those nodes will be invalid. "
-                        "This is harmful for some applications, "
-                        "causing silent performance regression. "
-                        "Adding self-loop on the input graph by "
-                        "calling `g = dgl.add_self_loop(g)` will resolve "
-                        "the issue. Setting ``allow_zero_in_degree`` "
-                        "to be `True` when constructing this module will "
-                        "suppress the check and let the code run."
-                    )
-
-        nfeat_bipartite = isinstance(nfeat, (list, tuple))
-        graph_bipartite = nfeat_bipartite or self.share_weights is False
-
-        _graph = self.get_cugraph_ops_CSC(
-            g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree
-        )
-        if deterministic_dgrad:
-            _graph.add_reverse_graph()
-
-        if nfeat_bipartite:
-            nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1]))
-            nfeat_dst_orig = nfeat[1]
-        else:
-            nfeat = self.feat_drop(nfeat)
-            nfeat_dst_orig = nfeat[: g.num_dst_nodes()]
-
-        if efeat is not None:
-            if self.lin_edge is None:
-                raise RuntimeError(
-                    f"{self.__class__.__name__}.edge_feats must be set to "
-                    f"accept edge features."
-                )
-            efeat = self.lin_edge(efeat)
-
-        if nfeat_bipartite:
-            nfeat = (self.lin_src(nfeat[0]), self.lin_dst(nfeat[1]))
-        elif graph_bipartite:
-            nfeat = (self.lin_src(nfeat), self.lin_dst(nfeat[: g.num_dst_nodes()]))
-        else:
-            nfeat = self.lin_src(nfeat)
-
-        out = ops_torch.operators.mha_gat_v2_n2n(
-            nfeat,
-            self.attn_weights,
-            _graph,
-            num_heads=self.num_heads,
-            activation="LeakyReLU",
-            negative_slope=self.negative_slope,
-            concat_heads=self.concat,
-            edge_feat=efeat,
-            deterministic_dgrad=deterministic_dgrad,
-            deterministic_wgrad=deterministic_wgrad,
-        )[: g.num_dst_nodes()]
-
-        if self.concat:
-            out = out.view(-1, self.num_heads, self.out_feats)
-
-        if self.residual:
-            res = self.lin_res(nfeat_dst_orig).view(-1, self.num_heads, self.out_feats)
-            if not self.concat:
-                res = res.mean(dim=1)
-            out = out + res
-
-        return out
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py
deleted file mode 100644
index 5c4b5dea441..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from typing import Optional, Union
-
-from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph
-from cugraph.utilities.utils import import_optional
-
-dgl = import_optional("dgl")
-torch = import_optional("torch")
-nn = import_optional("torch.nn")
-ops_torch = import_optional("pylibcugraphops.pytorch")
-
-
-class RelGraphConv(BaseConv):
-    r"""An accelerated relational graph convolution layer from `Modeling
-    Relational Data with Graph Convolutional Networks
-    <https://arxiv.org/abs/1703.06103>`__, with the sparse aggregation
-    accelerated by cugraph-ops.
-
-    Parameters
-    ----------
-    in_feats : int
-        Input feature size.
-    out_feats : int
-        Output feature size.
-    num_rels : int
-        Number of relations.
-    regularizer : str, optional
-        Which weight regularizer to use ("basis" or ``None``):
-         - "basis" is for basis-decomposition.
-         - ``None`` applies no regularization.
-        Default: ``None``.
-    num_bases : int, optional
-        Number of bases. It comes into effect when a regularizer is applied.
-        Default: ``None``.
-    bias : bool, optional
-        True if bias is added. Default: ``True``.
-    self_loop : bool, optional
-        True to include self loop message. Default: ``True``.
-    dropout : float, optional
-        Dropout rate. Default: ``0.0``.
-    apply_norm : bool, optional
-        True to normalize aggregation output by the in-degree of the destination
-        node per edge type, i.e. :math:`|\mathcal{N}^r_i|`. Default: ``True``.
-
-    Examples
-    --------
-    >>> import dgl
-    >>> import torch
-    >>> from cugraph_dgl.nn import RelGraphConv
-    ...
-    >>> device = 'cuda'
-    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3])).to(device)
-    >>> feat = torch.ones(6, 10).to(device)
-    >>> conv = RelGraphConv(
-    ...     10, 2, 3, regularizer='basis', num_bases=2).to(device)
-    >>> etypes = torch.tensor([0,1,2,0,1,2]).to(device)
-    >>> res = conv(g, feat, etypes)
-    >>> res
-    tensor([[-1.7774, -2.0184],
-            [-1.4335, -2.3758],
-            [-1.7774, -2.0184],
-            [-0.4698, -3.0876],
-            [-1.4335, -2.3758],
-            [-1.4331, -2.3295]], device='cuda:0', grad_fn=<AddBackward0>)
-    """
-
-    def __init__(
-        self,
-        in_feats: int,
-        out_feats: int,
-        num_rels: int,
-        regularizer: Optional[str] = None,
-        num_bases: Optional[int] = None,
-        bias: bool = True,
-        self_loop: bool = True,
-        dropout: float = 0.0,
-        apply_norm: bool = False,
-    ):
-        super().__init__()
-        self.in_feats = in_feats
-        self.out_feats = out_feats
-        self.num_rels = num_rels
-        self.apply_norm = apply_norm
-        self.dropout = nn.Dropout(dropout)
-
-        dim_self_loop = 1 if self_loop else 0
-        self.self_loop = self_loop
-        if regularizer is None:
-            self.W = nn.Parameter(
-                torch.empty(num_rels + dim_self_loop, in_feats, out_feats)
-            )
-            self.coeff = None
-        elif regularizer == "basis":
-            if num_bases is None:
-                raise ValueError('Missing "num_bases" for basis regularization.')
-            self.W = nn.Parameter(
-                torch.empty(num_bases + dim_self_loop, in_feats, out_feats)
-            )
-            self.coeff = nn.Parameter(torch.empty(num_rels, num_bases))
-            self.num_bases = num_bases
-        else:
-            raise ValueError(
-                f"Supported regularizer options: 'basis' or None, but got "
-                f"'{regularizer}'."
-            )
-        self.regularizer = regularizer
-
-        if bias:
-            self.bias = nn.Parameter(torch.empty(out_feats))
-        else:
-            self.register_parameter("bias", None)
-
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        r"""Reinitialize learnable parameters."""
-        bound = 1 / math.sqrt(self.in_feats)
-        end = -1 if self.self_loop else None
-        nn.init.uniform_(self.W[:end], -bound, bound)
-        if self.regularizer == "basis":
-            nn.init.xavier_uniform_(self.coeff, gain=nn.init.calculate_gain("relu"))
-        if self.self_loop:
-            nn.init.xavier_uniform_(self.W[-1], nn.init.calculate_gain("relu"))
-        if self.bias is not None:
-            nn.init.zeros_(self.bias)
-
-    def forward(
-        self,
-        g: Union[SparseGraph, dgl.DGLHeteroGraph],
-        feat: torch.Tensor,
-        etypes: torch.Tensor,
-        max_in_degree: Optional[int] = None,
-    ) -> torch.Tensor:
-        r"""Forward computation.
-
-        Parameters
-        ----------
-        g : DGLGraph
-            The graph.
-        feat : torch.Tensor
-            A 2D tensor of node features. Shape: :math:`(|V|, D_{in})`.
-        etypes : torch.Tensor
-            A 1D integer tensor of edge types. Shape: :math:`(|E|,)`.
-            Note that cugraph-ops only accepts edge type tensors in int32,
-            so any input of other integer types will be casted into int32,
-            thus introducing some overhead. Pass in int32 tensors directly
-            for best performance.
-        max_in_degree : int
-            Maximum in-degree of destination nodes. When :attr:`g` is generated
-            from a neighbor sampler, the value should be set to the corresponding
-            :attr:`fanout`. This option is used to invoke the MFG-variant of
-            cugraph-ops kernel.
-
-        Returns
-        -------
-        torch.Tensor
-            New node features. Shape: :math:`(|V|, D_{out})`.
-        """
-        _graph = self.get_cugraph_ops_HeteroCSC(
-            g,
-            num_edge_types=self.num_rels,
-            etypes=etypes,
-            is_bipartite=False,
-            max_in_degree=max_in_degree,
-        )
-
-        h = ops_torch.operators.agg_hg_basis_n2n_post(
-            feat,
-            self.coeff,
-            _graph,
-            concat_own=self.self_loop,
-            norm_by_out_degree=self.apply_norm,
-        )[: g.num_dst_nodes()]
-        h = h @ self.W.view(-1, self.out_feats)
-        if self.bias is not None:
-            h = h + self.bias
-        h = self.dropout(h)
-
-        return h
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py
deleted file mode 100644
index b6198903766..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Union
-
-from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph
-from cugraph.utilities.utils import import_optional
-
-dgl = import_optional("dgl")
-torch = import_optional("torch")
-nn = import_optional("torch.nn")
-ops_torch = import_optional("pylibcugraphops.pytorch")
-
-
-class SAGEConv(BaseConv):
-    r"""An accelerated GraphSAGE layer from `Inductive Representation Learning
-    on Large Graphs <https://arxiv.org/pdf/1706.02216.pdf>`, with the sparse
-    aggregation accelerated by cugraph-ops.
-
-    Parameters
-    ----------
-    in_feats : int or tuple
-        Input feature size. If a scalar is given, the source and destination
-        nodes are required to be the same.
-    out_feats : int
-        Output feature size.
-    aggregator_type : str
-        Aggregator type to use ("mean", "sum", "min", "max", "pool", "gcn").
-    feat_drop : float
-        Dropout rate on features, default: ``0``.
-    bias : bool
-        If True, adds a learnable bias to the output. Default: ``True``.
-
-    Examples
-    --------
-    >>> import dgl
-    >>> import torch
-    >>> from cugraph_dgl.nn import SAGEConv
-    ...
-    >>> device = 'cuda'
-    >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3])).to(device)
-    >>> g = dgl.add_self_loop(g)
-    >>> feat = torch.ones(6, 10).to(device)
-    >>> conv = SAGEConv(10, 2, 'mean').to(device)
-    >>> res = conv(g, feat)
-    >>> res
-    tensor([[-1.1690,  0.1952],
-            [-1.1690,  0.1952],
-            [-1.1690,  0.1952],
-            [-1.1690,  0.1952],
-            [-1.1690,  0.1952],
-            [-1.1690,  0.1952]], device='cuda:0', grad_fn=<AddmmBackward0>)
-    """
-    valid_aggr_types = {"mean", "sum", "min", "max", "pool", "gcn"}
-
-    def __init__(
-        self,
-        in_feats: Union[int, tuple[int, int]],
-        out_feats: int,
-        aggregator_type: str = "mean",
-        feat_drop: float = 0.0,
-        bias: bool = True,
-    ):
-        super().__init__()
-
-        if aggregator_type not in self.valid_aggr_types:
-            raise ValueError(
-                f"Invalid aggregator_type. Must be one of {self.valid_aggr_types}. "
-                f"But got '{aggregator_type}' instead."
-            )
-
-        self.aggregator_type = aggregator_type
-        self._aggr = aggregator_type
-        self.in_feats = in_feats
-        self.out_feats = out_feats
-        self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats)
-        self.feat_drop = nn.Dropout(feat_drop)
-
-        if self.aggregator_type == "gcn":
-            self._aggr = "mean"
-            self.lin = nn.Linear(self.in_feats_src, out_feats, bias=bias)
-        else:
-            self.lin = nn.Linear(
-                self.in_feats_src + self.in_feats_dst, out_feats, bias=bias
-            )
-
-        if self.aggregator_type == "pool":
-            self._aggr = "max"
-            self.pre_lin = nn.Linear(self.in_feats_src, self.in_feats_src)
-        else:
-            self.register_parameter("pre_lin", None)
-
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        r"""Reinitialize learnable parameters."""
-        self.lin.reset_parameters()
-        if self.pre_lin is not None:
-            self.pre_lin.reset_parameters()
-
-    def forward(
-        self,
-        g: Union[SparseGraph, dgl.DGLHeteroGraph],
-        feat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
-        max_in_degree: Optional[int] = None,
-    ) -> torch.Tensor:
-        r"""Forward computation.
-
-        Parameters
-        ----------
-        g : DGLGraph or SparseGraph
-            The graph.
-        feat : torch.Tensor or tuple
-            Node features. Shape: :math:`(|V|, D_{in})`.
-        max_in_degree : int
-            Maximum in-degree of destination nodes. When :attr:`g` is generated
-            from a neighbor sampler, the value should be set to the corresponding
-            :attr:`fanout`. This option is used to invoke the MFG-variant of
-            cugraph-ops kernel.
-
-        Returns
-        -------
-        torch.Tensor
-            Output node features. Shape: :math:`(|V|, D_{out})`.
-        """
-        feat_bipartite = isinstance(feat, (list, tuple))
-        graph_bipartite = feat_bipartite or self.aggregator_type == "pool"
-
-        _graph = self.get_cugraph_ops_CSC(
-            g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree
-        )
-
-        if feat_bipartite:
-            feat = (self.feat_drop(feat[0]), self.feat_drop(feat[1]))
-        else:
-            feat = self.feat_drop(feat)
-
-        if self.aggregator_type == "pool":
-            if feat_bipartite:
-                feat = (self.pre_lin(feat[0]).relu(), feat[1])
-            else:
-                feat = (self.pre_lin(feat).relu(), feat[: g.num_dst_nodes()])
-            # force ctx.needs_input_grad=True in cugraph-ops autograd function
-            feat[0].requires_grad_()
-            feat[1].requires_grad_()
-
-        out = ops_torch.operators.agg_concat_n2n(feat, _graph, self._aggr)[
-            : g.num_dst_nodes()
-        ]
-
-        if self.aggregator_type == "gcn":
-            out = out[:, : self.in_feats_src]
-
-        out = self.lin(out)
-
-        return out
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py
deleted file mode 100644
index e77556fb76f..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py
+++ /dev/null
@@ -1,173 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Union
-
-from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph
-from cugraph.utilities.utils import import_optional
-
-dgl = import_optional("dgl")
-torch = import_optional("torch")
-nn = import_optional("torch.nn")
-ops_torch = import_optional("pylibcugraphops.pytorch")
-
-
-class TransformerConv(BaseConv):
-    r"""The graph transformer layer from the `"Masked Label Prediction:
-    Unified Message Passing Model for Semi-Supervised Classification"
-    <https://arxiv.org/abs/2009.03509>`_ paper.
-
-    Parameters
-    ----------
-    in_node_feats : int or pair of ints
-        Input feature size. A pair denotes feature sizes of source and
-        destination nodes.
-    out_node_feats : int
-        Output feature size.
-    num_heads : int
-        Number of multi-head-attentions.
-    concat : bool, optional
-        If False, the multi-head attentions are averaged instead of concatenated.
-        Default: ``True``.
-    beta : bool, optional
-        If True, use a gated residual connection. Default: ``True``.
-    edge_feats: int, optional
-        Edge feature size. Default: ``None``.
-    bias: bool, optional
-        If True, learns a bias term. Default: ``True``.
-    root_weight: bool, optional
-        If False, will skip to learn a root weight matrix. Default: ``True``.
-    """
-
-    def __init__(
-        self,
-        in_node_feats: Union[int, tuple[int, int]],
-        out_node_feats: int,
-        num_heads: int,
-        concat: bool = True,
-        beta: bool = False,
-        edge_feats: Optional[int] = None,
-        bias: bool = True,
-        root_weight: bool = True,
-    ):
-        super().__init__()
-
-        self.in_node_feats = in_node_feats
-        self.out_node_feats = out_node_feats
-        self.num_heads = num_heads
-        self.concat = concat
-        self.beta = beta
-        self.edge_feats = edge_feats
-        self.bias = bias
-        self.root_weight = root_weight
-
-        if isinstance(in_node_feats, int):
-            in_node_feats = (in_node_feats, in_node_feats)
-
-        self.lin_key = nn.Linear(in_node_feats[0], num_heads * out_node_feats)
-        self.lin_query = nn.Linear(in_node_feats[1], num_heads * out_node_feats)
-        self.lin_value = nn.Linear(in_node_feats[0], num_heads * out_node_feats)
-
-        if edge_feats is not None:
-            self.lin_edge = nn.Linear(
-                edge_feats, num_heads * out_node_feats, bias=False
-            )
-        else:
-            self.lin_edge = self.register_parameter("lin_edge", None)
-
-        if concat:
-            self.lin_skip = nn.Linear(
-                in_node_feats[1], num_heads * out_node_feats, bias=bias
-            )
-            if self.beta:
-                self.lin_beta = nn.Linear(3 * num_heads * out_node_feats, 1, bias=bias)
-            else:
-                self.lin_beta = self.register_parameter("lin_beta", None)
-        else:
-            self.lin_skip = nn.Linear(in_node_feats[1], out_node_feats, bias=bias)
-            if self.beta:
-                self.lin_beta = nn.Linear(3 * out_node_feats, 1, bias=False)
-            else:
-                self.lin_beta = self.register_parameter("lin_beta", None)
-
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        self.lin_key.reset_parameters()
-        self.lin_query.reset_parameters()
-        self.lin_value.reset_parameters()
-        if self.lin_edge is not None:
-            self.lin_edge.reset_parameters()
-        if self.lin_skip is not None:
-            self.lin_skip.reset_parameters()
-        if self.lin_beta is not None:
-            self.lin_beta.reset_parameters()
-
-    def forward(
-        self,
-        g: Union[SparseGraph, dgl.DGLHeteroGraph],
-        nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
-        efeat: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Forward computation.
-
-        Parameters
-        ----------
-        g: DGLGraph
-            The graph.
-        nfeat: torch.Tensor or a pair of torch.Tensor
-            Node feature tensor. A pair denotes features for source and
-            destination nodes, respectively.
-        efeat: torch.Tensor, optional
-            Edge feature tensor. Default: ``None``.
-        """
-        feat_bipartite = isinstance(nfeat, (list, tuple))
-        if not feat_bipartite:
-            nfeat = (nfeat, nfeat)
-
-        _graph = self.get_cugraph_ops_CSC(g, is_bipartite=True)
-
-        query = self.lin_query(nfeat[1][: g.num_dst_nodes()])
-        key = self.lin_key(nfeat[0])
-        value = self.lin_value(nfeat[0])
-
-        if efeat is not None:
-            if self.lin_edge is None:
-                raise RuntimeError(
-                    f"{self.__class__.__name__}.edge_feats must be set to allow "
-                    f"edge features."
-                )
-            efeat = self.lin_edge(efeat)
-
-        out = ops_torch.operators.mha_simple_n2n(
-            key_emb=key,
-            query_emb=query,
-            value_emb=value,
-            graph=_graph,
-            num_heads=self.num_heads,
-            concat_heads=self.concat,
-            edge_emb=efeat,
-            norm_by_dim=True,
-            score_bias=None,
-        )[: g.num_dst_nodes()]
-
-        if self.root_weight:
-            res = self.lin_skip(nfeat[1][: g.num_dst_nodes()])
-            if self.lin_beta is not None:
-                beta = self.lin_beta(torch.cat([out, res, out - res], dim=-1))
-                beta = beta.sigmoid()
-                out = beta * res + (1 - beta) * out
-            else:
-                out = out + res
-
-        return out
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/__init__.py b/python/cugraph-dgl/cugraph_dgl/tests/__init__.py
deleted file mode 100644
index 1144e9bab3f..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/conftest.py b/python/cugraph-dgl/cugraph_dgl/tests/conftest.py
deleted file mode 100644
index ee1183f5cd1..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/conftest.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-import dgl
-import torch
-
-from cugraph.testing.mg_utils import (
-    start_dask_client,
-    stop_dask_client,
-)
-
-
-@pytest.fixture(scope="module")
-def dask_client():
-    # start_dask_client will check for the SCHEDULER_FILE and
-    # DASK_WORKER_DEVICES env vars and use them when creating a client if
-    # set. start_dask_client will also initialize the Comms singleton.
-    dask_client, dask_cluster = start_dask_client(
-        dask_worker_devices="0", protocol="tcp"
-    )
-
-    yield dask_client
-
-    stop_dask_client(dask_client, dask_cluster)
-
-
-class SparseGraphData1:
-    size = (6, 5)
-    nnz = 6
-    src_ids = torch.IntTensor([0, 1, 2, 3, 2, 5]).cuda()
-    dst_ids = torch.IntTensor([1, 2, 3, 4, 0, 3]).cuda()
-    values = torch.IntTensor([10, 20, 30, 40, 50, 60]).cuda()
-
-    # CSR
-    src_ids_sorted_by_src = torch.IntTensor([0, 1, 2, 2, 3, 5]).cuda()
-    dst_ids_sorted_by_src = torch.IntTensor([1, 2, 0, 3, 4, 3]).cuda()
-    csrc_ids = torch.IntTensor([0, 1, 2, 4, 5, 5, 6]).cuda()
-    values_csr = torch.IntTensor([10, 20, 50, 30, 40, 60]).cuda()
-
-    # CSC
-    src_ids_sorted_by_dst = torch.IntTensor([2, 0, 1, 5, 2, 3]).cuda()
-    dst_ids_sorted_by_dst = torch.IntTensor([0, 1, 2, 3, 3, 4]).cuda()
-    cdst_ids = torch.IntTensor([0, 1, 2, 3, 5, 6]).cuda()
-    values_csc = torch.IntTensor([50, 10, 20, 60, 30, 40]).cuda()
-
-
-@pytest.fixture
-def sparse_graph_1():
-    return SparseGraphData1()
-
-
-@pytest.fixture
-def dgl_graph_1():
-    src = torch.tensor([0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9])
-    dst = torch.tensor([1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0])
-    return dgl.graph((src, dst))
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py
deleted file mode 100644
index e2542657de4..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-try:
-    import cugraph_dgl
-except ModuleNotFoundError:
-    pytest.skip("cugraph_dgl not available", allow_module_level=True)
-
-import dgl
-import torch as th
-from cugraph_dgl import cugraph_storage_from_heterograph
-import tempfile
-import numpy as np
-
-
-def sample_dgl_graphs(g, train_nid, fanouts):
-    # Single fanout to match cugraph
-    sampler = dgl.dataloading.NeighborSampler(fanouts)
-    dataloader = dgl.dataloading.DataLoader(
-        g,
-        train_nid,
-        sampler,
-        batch_size=1,
-        shuffle=False,
-        drop_last=False,
-        num_workers=0,
-    )
-
-    dgl_output = {}
-    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
-        dgl_output[batch_id] = {
-            "input_nodes": input_nodes,
-            "output_nodes": output_nodes,
-            "blocks": blocks,
-        }
-    return dgl_output
-
-
-def sample_cugraph_dgl_graphs(cugraph_gs, train_nid, fanouts):
-    sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts)
-    tempdir_object = tempfile.TemporaryDirectory()
-    sampling_output_dir = tempdir_object
-    dataloader = cugraph_dgl.dataloading.DaskDataLoader(
-        cugraph_gs,
-        train_nid,
-        sampler,
-        batch_size=1,
-        sampling_output_dir=sampling_output_dir.name,
-        drop_last=False,
-        shuffle=False,
-    )
-
-    cugraph_dgl_output = {}
-    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
-        cugraph_dgl_output[batch_id] = {
-            "input_nodes": input_nodes,
-            "output_nodes": output_nodes,
-            "blocks": blocks,
-        }
-    return cugraph_dgl_output
-
-
-def test_same_heterograph_results():
-    single_gpu = True
-    data_dict = {
-        ("B", "BA", "A"): ([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]),
-        ("C", "CA", "A"): ([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]),
-    }
-    train_nid = {"A": th.tensor([0])}
-    # Create a heterograph with 3 node types and 3 edges types.
-    dgl_g = dgl.heterograph(data_dict)
-    cugraph_gs = cugraph_storage_from_heterograph(dgl_g, single_gpu=single_gpu)
-
-    dgl_output = sample_dgl_graphs(dgl_g, train_nid, [{"BA": 1, "CA": 1}])
-    cugraph_output = sample_cugraph_dgl_graphs(cugraph_gs, train_nid, [2])
-
-    cugraph_output_nodes = cugraph_output[0]["output_nodes"]["A"].cpu().numpy()
-    dgl_output_nodes = dgl_output[0]["output_nodes"]["A"].cpu().numpy()
-    np.testing.assert_array_equal(cugraph_output_nodes, dgl_output_nodes)
-    assert (
-        dgl_output[0]["blocks"][0].num_edges()
-        == cugraph_output[0]["blocks"][0].num_edges()
-    )
-    assert (
-        dgl_output[0]["blocks"][0].num_dst_nodes()
-        == cugraph_output[0]["blocks"][0].num_dst_nodes()
-    )
-
-
-def test_same_homogeneousgraph_results():
-    single_gpu = True
-    train_nid = th.tensor([1])
-    # Create a heterograph with 3 node types and 3 edges types.
-    dgl_g = dgl.graph(([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]))
-    cugraph_gs = cugraph_storage_from_heterograph(dgl_g, single_gpu=single_gpu)
-
-    dgl_output = sample_dgl_graphs(dgl_g, train_nid, [2])
-    cugraph_output = sample_cugraph_dgl_graphs(cugraph_gs, train_nid, [2])
-
-    cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy()
-    dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy()
-    np.testing.assert_array_equal(cugraph_output_nodes, dgl_output_nodes)
-    assert (
-        dgl_output[0]["blocks"][0].num_dst_nodes()
-        == cugraph_output[0]["blocks"][0].num_dst_nodes()
-    )
-    assert (
-        dgl_output[0]["blocks"][0].num_edges()
-        == cugraph_output[0]["blocks"][0].num_edges()
-    )
-
-
-def test_heterograph_multi_block_results():
-    data_dict = {
-        ("B", "BA", "A"): ([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]),
-        ("C", "CA", "A"): ([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]),
-        ("A", "AA", "A"): ([1], [0]),
-    }
-    dgl_g = dgl.heterograph(data_dict)
-    cugraph_g = cugraph_dgl.cugraph_storage_from_heterograph(dgl_g, single_gpu=True)
-    train_nid = {"A": th.tensor([0])}
-    cugraph_dgl_output = sample_cugraph_dgl_graphs(cugraph_g, train_nid, [10, 10])
-    assert (
-        cugraph_dgl_output[0]["blocks"][0].num_dst_nodes()
-        == cugraph_dgl_output[0]["blocks"][1].num_src_nodes()
-    )
-
-
-def test_homogenousgraph_multi_block_results():
-    dgl_g = dgl.graph(data=([1, 2, 2, 3, 4, 5], [0, 0, 1, 2, 2, 3]))
-    cugraph_g = cugraph_dgl.cugraph_storage_from_heterograph(dgl_g, single_gpu=True)
-    train_nid = th.tensor([0])
-    cugraph_dgl_output = sample_cugraph_dgl_graphs(cugraph_g, train_nid, [2, 2, 2])
-    assert (
-        cugraph_dgl_output[0]["blocks"][0].num_dst_nodes()
-        == cugraph_dgl_output[0]["blocks"][1].num_src_nodes()
-    )
-    assert (
-        cugraph_dgl_output[0]["blocks"][1].num_dst_nodes()
-        == cugraph_dgl_output[0]["blocks"][2].num_src_nodes()
-    )
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py
deleted file mode 100644
index d49e1293e77..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import pytest
-
-try:
-    import cugraph_dgl
-except ModuleNotFoundError:
-    pytest.skip("cugraph_dgl not available", allow_module_level=True)
-
-import dgl
-import torch as th
-from cugraph_dgl import cugraph_storage_from_heterograph
-import tempfile
-import numpy as np
-
-
-def sample_dgl_graphs(g, train_nid, fanouts):
-    # Single fanout to match cugraph
-    sampler = dgl.dataloading.NeighborSampler(fanouts)
-    dataloader = dgl.dataloading.DataLoader(
-        g,
-        train_nid,
-        sampler,
-        batch_size=1,
-        shuffle=False,
-        drop_last=False,
-        num_workers=0,
-    )
-
-    dgl_output = {}
-    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
-        dgl_output[batch_id] = {
-            "input_nodes": input_nodes,
-            "output_nodes": output_nodes,
-            "blocks": blocks,
-        }
-    return dgl_output
-
-
-def sample_cugraph_dgl_graphs(cugraph_gs, train_nid, fanouts):
-    sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts)
-    tempdir_object = tempfile.TemporaryDirectory()
-    sampling_output_dir = tempdir_object
-    dataloader = cugraph_dgl.dataloading.DaskDataLoader(
-        cugraph_gs,
-        train_nid,
-        sampler,
-        batch_size=1,
-        sampling_output_dir=sampling_output_dir.name,
-        drop_last=False,
-        shuffle=False,
-    )
-
-    cugraph_dgl_output = {}
-    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
-        cugraph_dgl_output[batch_id] = {
-            "input_nodes": input_nodes,
-            "output_nodes": output_nodes,
-            "blocks": blocks,
-        }
-    return cugraph_dgl_output
-
-
-def test_same_heterograph_results(dask_client):
-    single_gpu = False
-    data_dict = {
-        ("B", "BA", "A"): ([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]),
-        ("C", "CA", "A"): ([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]),
-    }
-    train_nid = {"A": th.tensor([0])}
-    # Create a heterograph with 3 node types and 3 edges types.
-    dgl_g = dgl.heterograph(data_dict)
-    cugraph_gs = cugraph_storage_from_heterograph(dgl_g, single_gpu=single_gpu)
-
-    dgl_output = sample_dgl_graphs(dgl_g, train_nid, [{"BA": 1, "CA": 1}])
-    cugraph_output = sample_cugraph_dgl_graphs(cugraph_gs, train_nid, [2])
-
-    cugraph_output_nodes = cugraph_output[0]["output_nodes"]["A"].cpu().numpy()
-    dgl_output_nodes = dgl_output[0]["output_nodes"]["A"].cpu().numpy()
-    np.testing.assert_array_equal(cugraph_output_nodes, dgl_output_nodes)
-    assert (
-        dgl_output[0]["blocks"][0].num_edges()
-        == cugraph_output[0]["blocks"][0].num_edges()
-    )
-    assert (
-        dgl_output[0]["blocks"][0].num_dst_nodes()
-        == cugraph_output[0]["blocks"][0].num_dst_nodes()
-    )
-
-
-def test_same_homogeneousgraph_results(dask_client):
-    single_gpu = False
-    train_nid = th.tensor([1])
-    # Create a heterograph with 3 node types and 3 edges types.
-    dgl_g = dgl.graph(([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]))
-    cugraph_gs = cugraph_storage_from_heterograph(dgl_g, single_gpu=single_gpu)
-
-    dgl_output = sample_dgl_graphs(dgl_g, train_nid, [2])
-    cugraph_output = sample_cugraph_dgl_graphs(cugraph_gs, train_nid, [2])
-
-    cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy()
-    dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy()
-    np.testing.assert_array_equal(cugraph_output_nodes, dgl_output_nodes)
-    assert (
-        dgl_output[0]["blocks"][0].num_dst_nodes()
-        == cugraph_output[0]["blocks"][0].num_dst_nodes()
-    )
-    assert (
-        dgl_output[0]["blocks"][0].num_edges()
-        == cugraph_output[0]["blocks"][0].num_edges()
-    )
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py
deleted file mode 100644
index 419ec7790a9..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import cugraph_dgl.dataloading
-import pytest
-
-import cugraph_dgl
-
-from cugraph.datasets import karate
-from cugraph.utilities.utils import import_optional, MissingModule
-
-import numpy as np
-
-torch = import_optional("torch")
-dgl = import_optional("dgl")
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
-def test_dataloader_basic_homogeneous():
-    graph = cugraph_dgl.Graph(is_multi_gpu=False)
-
-    num_nodes = karate.number_of_nodes()
-    graph.add_nodes(num_nodes, data={"z": torch.arange(num_nodes)})
-
-    edf = karate.get_edgelist()
-    graph.add_edges(
-        u=edf["src"], v=edf["dst"], data={"q": torch.arange(karate.number_of_edges())}
-    )
-
-    sampler = cugraph_dgl.dataloading.NeighborSampler([5, 5, 5])
-    loader = cugraph_dgl.dataloading.FutureDataLoader(
-        graph, torch.arange(num_nodes), sampler, batch_size=2
-    )
-
-    for in_t, out_t, blocks in loader:
-        assert len(blocks) == 3
-        assert len(out_t) <= 2
-
-
-def sample_dgl_graphs(g, train_nid, fanouts, batch_size=1, prob_attr=None):
-    # Single fanout to match cugraph
-    sampler = dgl.dataloading.NeighborSampler(
-        fanouts,
-        prob=prob_attr,
-    )
-    dataloader = dgl.dataloading.DataLoader(
-        g,
-        train_nid,
-        sampler,
-        batch_size=batch_size,
-        shuffle=False,
-        drop_last=False,
-        num_workers=0,
-    )
-
-    dgl_output = {}
-    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
-        dgl_output[batch_id] = {
-            "input_nodes": input_nodes,
-            "output_nodes": output_nodes,
-            "blocks": blocks,
-        }
-    return dgl_output
-
-
-def sample_cugraph_dgl_graphs(
-    cugraph_g, train_nid, fanouts, batch_size=1, prob_attr=None
-):
-    sampler = cugraph_dgl.dataloading.NeighborSampler(
-        fanouts,
-        prob=prob_attr,
-    )
-
-    dataloader = cugraph_dgl.dataloading.FutureDataLoader(
-        cugraph_g,
-        train_nid,
-        sampler,
-        batch_size=batch_size,
-        drop_last=False,
-        shuffle=False,
-    )
-
-    cugraph_dgl_output = {}
-    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
-        cugraph_dgl_output[batch_id] = {
-            "input_nodes": input_nodes,
-            "output_nodes": output_nodes,
-            "blocks": blocks,
-        }
-    return cugraph_dgl_output
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
-@pytest.mark.parametrize("ix", [[1], [1, 0]])
-@pytest.mark.parametrize("batch_size", [1, 2])
-def test_same_homogeneousgraph_results(ix, batch_size):
-    src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
-    dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1])
-
-    train_nid = torch.tensor(ix)
-    # Create a heterograph with 3 node types and 3 edges types.
-    dgl_g = dgl.graph((src, dst))
-
-    cugraph_g = cugraph_dgl.Graph(is_multi_gpu=False)
-    cugraph_g.add_nodes(9)
-    cugraph_g.add_edges(u=src, v=dst)
-
-    dgl_output = sample_dgl_graphs(dgl_g, train_nid, [2], batch_size=batch_size)
-    cugraph_output = sample_cugraph_dgl_graphs(cugraph_g, train_nid, [2], batch_size)
-
-    cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy()
-    dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy()
-
-    np.testing.assert_array_equal(
-        np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes)
-    )
-    assert (
-        dgl_output[0]["blocks"][0].num_dst_nodes()
-        == cugraph_output[0]["blocks"][0].num_dst_nodes()
-    )
-    assert (
-        dgl_output[0]["blocks"][0].num_edges()
-        == cugraph_output[0]["blocks"][0].num_edges()
-    )
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
-def test_dataloader_biased_homogeneous():
-    src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
-    dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1])
-    wgt = torch.tensor([1, 1, 2, 0, 0, 0, 2, 1], dtype=torch.float32)
-
-    train_nid = torch.tensor([0, 1])
-    # Create a heterograph with 3 node types and 3 edges types.
-    dgl_g = dgl.graph((src, dst))
-    dgl_g.edata["wgt"] = wgt
-
-    cugraph_g = cugraph_dgl.Graph(is_multi_gpu=False)
-    cugraph_g.add_nodes(9)
-    cugraph_g.add_edges(u=src, v=dst, data={"wgt": wgt})
-
-    dgl_output = sample_dgl_graphs(dgl_g, train_nid, [4], batch_size=2, prob_attr="wgt")
-    cugraph_output = sample_cugraph_dgl_graphs(
-        cugraph_g, train_nid, [4], batch_size=2, prob_attr="wgt"
-    )
-
-    cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy()
-    dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy()
-
-    np.testing.assert_array_equal(
-        np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes)
-    )
-    assert (
-        dgl_output[0]["blocks"][0].num_dst_nodes()
-        == cugraph_output[0]["blocks"][0].num_dst_nodes()
-    )
-    assert (
-        dgl_output[0]["blocks"][0].num_edges()
-        == cugraph_output[0]["blocks"][0].num_edges()
-    )
-    assert 5 == cugraph_output[0]["blocks"][0].num_edges()
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py
deleted file mode 100644
index 061f4fa2077..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py
+++ /dev/null
@@ -1,254 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-import numpy as np
-
-import cugraph_dgl
-
-from cugraph.datasets import karate
-from cugraph.utilities.utils import import_optional, MissingModule
-
-from cugraph.gnn import (
-    cugraph_comms_create_unique_id,
-    cugraph_comms_shutdown,
-)
-
-from cugraph_dgl.tests.utils import init_pytorch_worker
-
-torch = import_optional("torch")
-dgl = import_optional("dgl")
-
-
-def run_test_dataloader_basic_homogeneous(rank, world_size, uid):
-    init_pytorch_worker(rank, world_size, uid)
-
-    graph = cugraph_dgl.Graph(is_multi_gpu=True)
-
-    num_nodes = karate.number_of_nodes()
-    graph.add_nodes(
-        num_nodes,
-    )
-
-    edf = karate.get_edgelist()
-    graph.add_edges(
-        u=torch.tensor_split(torch.as_tensor(edf["src"], device="cuda"), world_size)[
-            rank
-        ],
-        v=torch.tensor_split(torch.as_tensor(edf["dst"], device="cuda"), world_size)[
-            rank
-        ],
-    )
-
-    sampler = cugraph_dgl.dataloading.NeighborSampler([5, 5, 5])
-    loader = cugraph_dgl.dataloading.FutureDataLoader(
-        graph,
-        torch.arange(num_nodes),
-        sampler,
-        batch_size=2,
-        use_ddp=True,
-    )
-
-    for in_t, out_t, blocks in loader:
-        assert len(blocks) == 3
-        assert len(out_t) <= 2
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
-def test_dataloader_basic_homogeneous():
-    uid = cugraph_comms_create_unique_id()
-    # Limit the number of GPUs this rest is run with
-    world_size = min(torch.cuda.device_count(), 4)
-
-    torch.multiprocessing.spawn(
-        run_test_dataloader_basic_homogeneous,
-        args=(
-            world_size,
-            uid,
-        ),
-        nprocs=world_size,
-    )
-
-
-def sample_dgl_graphs(
-    g,
-    train_nid,
-    fanouts,
-    batch_size=1,
-    prob_attr=None,
-):
-    # Single fanout to match cugraph
-    sampler = dgl.dataloading.NeighborSampler(
-        fanouts,
-        prob=prob_attr,
-    )
-    dataloader = dgl.dataloading.DataLoader(
-        g,
-        train_nid,
-        sampler,
-        batch_size=batch_size,
-        shuffle=False,
-        drop_last=False,
-        num_workers=0,
-    )
-
-    dgl_output = {}
-    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
-        dgl_output[batch_id] = {
-            "input_nodes": input_nodes,
-            "output_nodes": output_nodes,
-            "blocks": blocks,
-        }
-    return dgl_output
-
-
-def sample_cugraph_dgl_graphs(
-    cugraph_g,
-    train_nid,
-    fanouts,
-    batch_size=1,
-    prob_attr=None,
-):
-    sampler = cugraph_dgl.dataloading.NeighborSampler(
-        fanouts,
-        prob=prob_attr,
-    )
-
-    dataloader = cugraph_dgl.dataloading.FutureDataLoader(
-        cugraph_g,
-        train_nid,
-        sampler,
-        batch_size=batch_size,
-        drop_last=False,
-        shuffle=False,
-    )
-
-    cugraph_dgl_output = {}
-    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
-        cugraph_dgl_output[batch_id] = {
-            "input_nodes": input_nodes,
-            "output_nodes": output_nodes,
-            "blocks": blocks,
-        }
-    return cugraph_dgl_output
-
-
-def run_test_same_homogeneousgraph_results(rank, world_size, uid, ix, batch_size):
-    init_pytorch_worker(rank, world_size, uid)
-
-    src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
-    dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1])
-
-    local_src = torch.tensor_split(src, world_size)[rank]
-    local_dst = torch.tensor_split(dst, world_size)[rank]
-
-    train_nid = torch.tensor(ix)
-    # Create a heterograph with 3 node types and 3 edges types.
-    dgl_g = dgl.graph((src, dst))
-
-    cugraph_g = cugraph_dgl.Graph(is_multi_gpu=True)
-    cugraph_g.add_nodes(9)
-    cugraph_g.add_edges(u=local_src, v=local_dst)
-
-    dgl_output = sample_dgl_graphs(dgl_g, train_nid, [2], batch_size=batch_size)
-    cugraph_output = sample_cugraph_dgl_graphs(cugraph_g, train_nid, [2], batch_size)
-
-    cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy()
-    dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy()
-
-    np.testing.assert_array_equal(
-        np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes)
-    )
-    assert (
-        dgl_output[0]["blocks"][0].num_dst_nodes()
-        == cugraph_output[0]["blocks"][0].num_dst_nodes()
-    )
-    assert (
-        dgl_output[0]["blocks"][0].num_edges()
-        == cugraph_output[0]["blocks"][0].num_edges()
-    )
-
-    cugraph_comms_shutdown()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
-@pytest.mark.parametrize("ix", [[1], [1, 0]])
-@pytest.mark.parametrize("batch_size", [1, 2])
-def test_same_homogeneousgraph_results_mg(ix, batch_size):
-    uid = cugraph_comms_create_unique_id()
-    # Limit the number of GPUs this rest is run with
-    world_size = min(torch.cuda.device_count(), 4)
-
-    torch.multiprocessing.spawn(
-        run_test_same_homogeneousgraph_results,
-        args=(world_size, uid, ix, batch_size),
-        nprocs=world_size,
-    )
-
-
-def run_test_dataloader_biased_homogeneous(rank, world_size, uid):
-    init_pytorch_worker(rank, world_size, uid, True)
-
-    src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]) + (rank * 9)
-    dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1]) + (rank * 9)
-    wgt = torch.tensor(
-        [0.1, 0.1, 0.2, 0, 0, 0, 0.2, 0.1] * world_size, dtype=torch.float32
-    )
-
-    train_nid = torch.tensor([0, 1]) + (rank * 9)
-    # Create a heterograph with 3 node types and 3 edge types.
-    dgl_g = dgl.graph((src, dst))
-    dgl_g.edata["wgt"] = wgt[:8]
-
-    cugraph_g = cugraph_dgl.Graph(is_multi_gpu=True)
-    cugraph_g.add_nodes(9 * world_size)
-    cugraph_g.add_edges(u=src, v=dst, data={"wgt": wgt})
-
-    dgl_output = sample_dgl_graphs(dgl_g, train_nid, [4], batch_size=2, prob_attr="wgt")
-    cugraph_output = sample_cugraph_dgl_graphs(
-        cugraph_g, train_nid, [4], batch_size=2, prob_attr="wgt"
-    )
-
-    cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy()
-    dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy()
-
-    np.testing.assert_array_equal(
-        np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes)
-    )
-    assert (
-        dgl_output[0]["blocks"][0].num_dst_nodes()
-        == cugraph_output[0]["blocks"][0].num_dst_nodes()
-    )
-    assert (
-        dgl_output[0]["blocks"][0].num_edges()
-        == cugraph_output[0]["blocks"][0].num_edges()
-    )
-
-    assert 5 == cugraph_output[0]["blocks"][0].num_edges()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
-def test_dataloader_biased_homogeneous_mg():
-    uid = cugraph_comms_create_unique_id()
-    # Limit the number of GPUs this test is run with
-    world_size = torch.cuda.device_count()
-
-    torch.multiprocessing.spawn(
-        run_test_dataloader_biased_homogeneous,
-        args=(world_size, uid),
-        nprocs=world_size,
-    )
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataset.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataset.py
deleted file mode 100644
index 5db443dc0d8..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataset.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-try:
-    import cugraph_dgl
-
-    del cugraph_dgl
-except ModuleNotFoundError:
-    pytest.skip("cugraph_dgl not available", allow_module_level=True)
-
-from dgl.dataloading import MultiLayerNeighborSampler
-import dgl
-import torch
-import cudf
-import pandas as pd
-import cupy as cp
-import numpy as np
-from cugraph_dgl.dataloading.utils.sampling_helpers import (
-    create_homogeneous_sampled_graphs_from_dataframe,
-)
-
-
-def get_edge_df_from_homogenous_block(block):
-    block = block.to("cpu")
-    src, dst, eid = block.edges("all")
-    src = block.srcdata[dgl.NID][src]
-    dst = block.dstdata[dgl.NID][dst]
-    eid = block.edata[dgl.EID][eid]
-    df = pd.DataFrame({"src": src, "dst": dst, "eid": eid})
-    return df.sort_values(by="eid").reset_index(drop=True)
-
-
-def create_dgl_mfgs(g, seed_nodes, fanout):
-    sampler = MultiLayerNeighborSampler(fanout)
-    return sampler.sample_blocks(g, seed_nodes)
-
-
-def create_cugraph_dgl_homogenous_mfgs(dgl_blocks, return_type):
-    df_ls = []
-    unique_vertices_ls = []
-    for hop_id, block in enumerate(reversed(dgl_blocks)):
-        block = block.to("cpu")
-        src, dst, eid = block.edges("all")
-        eid = block.edata[dgl.EID][eid]
-
-        og_src = block.srcdata[dgl.NID][src]
-        og_dst = block.dstdata[dgl.NID][dst]
-        unique_vertices = pd.concat(
-            [pd.Series(og_dst.numpy()), pd.Series(og_src.numpy())]
-        ).drop_duplicates(keep="first")
-        unique_vertices_ls.append(unique_vertices)
-        df = cudf.DataFrame(
-            {
-                "sources": cp.asarray(src),
-                "destinations": cp.asarray(dst),
-                "edge_id": cp.asarray(eid),
-            }
-        )
-        df["hop_id"] = hop_id
-        df_ls.append(df)
-    df = cudf.concat(df_ls, ignore_index=True)
-    df["batch_id"] = 0
-
-    # Add map column
-    # to the dataframe
-    renumberd_map = pd.concat(unique_vertices_ls).drop_duplicates(keep="first").values
-    offsets = np.asarray([2, 2 + len(renumberd_map)])
-    map_ar = np.concatenate([offsets, renumberd_map])
-    map_ser = cudf.Series(map_ar)
-    # Have to reindex cause map_ser can be of larger length than df
-    df = df.reindex(df.index.union(map_ser.index))
-    df["map"] = map_ser
-    return create_homogeneous_sampled_graphs_from_dataframe(
-        df, return_type=return_type
-    )[0]
-
-
-@pytest.mark.parametrize("return_type", ["dgl.Block", "cugraph_dgl.nn.SparseGraph"])
-@pytest.mark.parametrize("seed_node", [3, 4, 5])
-def test_homogeneous_sampled_graphs_from_dataframe(return_type, seed_node):
-    g = dgl.graph(([0, 1, 2, 3, 4], [1, 2, 3, 4, 5]))
-    fanout = [1, 1, 1]
-    seed_node = torch.as_tensor([seed_node])
-
-    dgl_seed_nodes, dgl_output_nodes, dgl_mfgs = create_dgl_mfgs(g, seed_node, fanout)
-    (
-        cugraph_seed_nodes,
-        cugraph_output_nodes,
-        cugraph_mfgs,
-    ) = create_cugraph_dgl_homogenous_mfgs(dgl_mfgs, return_type=return_type)
-
-    np.testing.assert_equal(
-        cugraph_seed_nodes.cpu().numpy().copy().sort(),
-        dgl_seed_nodes.cpu().numpy().copy().sort(),
-    )
-
-    np.testing.assert_equal(
-        dgl_output_nodes.cpu().numpy().copy().sort(),
-        cugraph_output_nodes.cpu().numpy().copy().sort(),
-    )
-
-    if return_type == "dgl.Block":
-        for dgl_block, cugraph_dgl_block in zip(dgl_mfgs, cugraph_mfgs):
-            dgl_df = get_edge_df_from_homogenous_block(dgl_block)
-            cugraph_dgl_df = get_edge_df_from_homogenous_block(cugraph_dgl_block)
-            pd.testing.assert_frame_equal(dgl_df, cugraph_dgl_df)
-    else:
-        for dgl_block, cugraph_dgl_graph in zip(dgl_mfgs, cugraph_mfgs):
-            # Can not verify edge ids as they are not
-            # preserved in cugraph_dgl.nn.SparseGraph
-            assert dgl_block.num_src_nodes() == cugraph_dgl_graph.num_src_nodes()
-            assert dgl_block.num_dst_nodes() == cugraph_dgl_graph.num_dst_nodes()
-            dgl_offsets, dgl_indices, _ = dgl_block.adj_tensors("csc")
-            cugraph_offsets, cugraph_indices, _ = cugraph_dgl_graph.csc()
-            assert torch.equal(dgl_offsets.to("cpu"), cugraph_offsets.to("cpu"))
-            assert torch.equal(dgl_indices.to("cpu"), cugraph_indices.to("cpu"))
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py
deleted file mode 100644
index de27efc6329..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_dgl.nn.conv.base import SparseGraph
-from cugraph_dgl.nn import GATConv as CuGraphGATConv
-
-dgl = pytest.importorskip("dgl", reason="DGL not available")
-torch = pytest.importorskip("torch", reason="PyTorch not available")
-
-ATOL = 1e-6
-
-
-@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"])
-@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64])
-@pytest.mark.parametrize("max_in_degree", [None, 8])
-@pytest.mark.parametrize("num_heads", [1, 2, 7])
-@pytest.mark.parametrize("residual", [False, True])
-@pytest.mark.parametrize("to_block", [False, True])
-@pytest.mark.parametrize("sparse_format", ["coo", "csc", None])
-def test_gatconv_equality(
-    dgl_graph_1,
-    mode,
-    idx_type,
-    max_in_degree,
-    num_heads,
-    residual,
-    to_block,
-    sparse_format,
-):
-    from dgl.nn.pytorch import GATConv
-
-    torch.manual_seed(12345)
-    device = torch.device("cuda")
-    g = dgl_graph_1.to(device).astype(idx_type)
-
-    if to_block:
-        g = dgl.to_block(g)
-
-    size = (g.num_src_nodes(), g.num_dst_nodes())
-
-    if mode == "bipartite":
-        in_feats = (10, 3)
-        nfeat = (
-            torch.randn(size[0], in_feats[0]).to(device),
-            torch.randn(size[1], in_feats[1]).to(device),
-        )
-    elif mode == "share_weights":
-        in_feats = 5
-        nfeat = (
-            torch.randn(size[0], in_feats).to(device),
-            torch.randn(size[1], in_feats).to(device),
-        )
-    else:
-        in_feats = 7
-        nfeat = torch.randn(size[0], in_feats).to(device)
-    out_feats = 2
-
-    if sparse_format == "coo":
-        sg = SparseGraph(
-            size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc"
-        )
-    elif sparse_format == "csc":
-        offsets, indices, _ = g.adj_tensors("csc")
-        sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc")
-
-    args = (in_feats, out_feats, num_heads)
-    kwargs = {"bias": False, "allow_zero_in_degree": True, "residual": residual}
-
-    conv1 = GATConv(*args, **kwargs).to(device)
-    conv2 = CuGraphGATConv(*args, **kwargs).to(device)
-
-    dim = num_heads * out_feats
-    with torch.no_grad():
-        conv2.attn_weights[:dim].copy_(conv1.attn_l.flatten())
-        conv2.attn_weights[dim:].copy_(conv1.attn_r.flatten())
-        if mode == "bipartite":
-            conv2.lin_src.weight.copy_(conv1.fc_src.weight)
-            conv2.lin_dst.weight.copy_(conv1.fc_dst.weight)
-        else:
-            conv2.lin.weight.copy_(conv1.fc.weight)
-        if residual and conv1.has_linear_res:
-            conv2.lin_res.weight.copy_(conv1.res_fc.weight)
-
-    out1 = conv1(g, nfeat)
-    if sparse_format is not None:
-        out2 = conv2(sg, nfeat, max_in_degree=max_in_degree)
-    else:
-        out2 = conv2(g, nfeat, max_in_degree=max_in_degree)
-
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_out1 = torch.randn_like(out1)
-    grad_out2 = grad_out1.detach().clone()
-    out1.backward(grad_out1)
-    out2.backward(grad_out2)
-
-    if mode == "bipartite":
-        assert torch.allclose(
-            conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL
-        )
-        assert torch.allclose(
-            conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL
-        )
-    else:
-        assert torch.allclose(conv1.fc.weight.grad, conv2.lin.weight.grad, atol=ATOL)
-
-    if residual and conv1.has_linear_res:
-        assert torch.allclose(
-            conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL
-        )
-
-    assert torch.allclose(
-        torch.cat((conv1.attn_l.grad, conv1.attn_r.grad), dim=0),
-        conv2.attn_weights.grad.view(2, num_heads, out_feats),
-        atol=1e-5,  # Note: using a loosened tolerance here due to numerical error
-    )
-
-
-@pytest.mark.parametrize("bias", [False, True])
-@pytest.mark.parametrize("bipartite", [False, True])
-@pytest.mark.parametrize("concat", [False, True])
-@pytest.mark.parametrize("max_in_degree", [None, 8])
-@pytest.mark.parametrize("num_heads", [1, 2, 7])
-@pytest.mark.parametrize("to_block", [False, True])
-@pytest.mark.parametrize("use_edge_feats", [False, True])
-def test_gatconv_edge_feats(
-    dgl_graph_1,
-    bias,
-    bipartite,
-    concat,
-    max_in_degree,
-    num_heads,
-    to_block,
-    use_edge_feats,
-):
-    torch.manual_seed(12345)
-    device = torch.device("cuda")
-    g = dgl_graph_1.to(device)
-
-    if to_block:
-        g = dgl.to_block(g)
-
-    if bipartite:
-        in_feats = (10, 3)
-        nfeat = (
-            torch.rand(g.num_src_nodes(), in_feats[0]).to(device),
-            torch.rand(g.num_dst_nodes(), in_feats[1]).to(device),
-        )
-    else:
-        in_feats = 10
-        nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device)
-    out_feats = 2
-
-    if use_edge_feats:
-        edge_feats = 3
-        efeat = torch.rand(g.num_edges(), edge_feats).to(device)
-    else:
-        edge_feats = None
-        efeat = None
-
-    conv = CuGraphGATConv(
-        in_feats,
-        out_feats,
-        num_heads,
-        concat=concat,
-        edge_feats=edge_feats,
-        bias=bias,
-        allow_zero_in_degree=True,
-    ).to(device)
-    out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree)
-
-    grad_out = torch.randn_like(out)
-    out.backward(grad_out)
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py
deleted file mode 100644
index 2d26b7fdc28..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py
+++ /dev/null
@@ -1,182 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_dgl.nn.conv.base import SparseGraph
-from cugraph_dgl.nn import GATv2Conv as CuGraphGATv2Conv
-
-dgl = pytest.importorskip("dgl", reason="DGL not available")
-torch = pytest.importorskip("torch", reason="PyTorch not available")
-
-ATOL = 1e-5
-
-
-@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"])
-@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64])
-@pytest.mark.parametrize("max_in_degree", [None, 8])
-@pytest.mark.parametrize("num_heads", [1, 2, 7])
-@pytest.mark.parametrize("residual", [False, True])
-@pytest.mark.parametrize("to_block", [False, True])
-@pytest.mark.parametrize("sparse_format", ["coo", "csc", None])
-def test_gatv2conv_equality(
-    dgl_graph_1,
-    mode,
-    idx_type,
-    max_in_degree,
-    num_heads,
-    residual,
-    to_block,
-    sparse_format,
-):
-    from dgl.nn.pytorch import GATv2Conv
-
-    torch.manual_seed(12345)
-    device = torch.device("cuda")
-    g = dgl_graph_1.to(device).astype(idx_type)
-
-    if to_block:
-        g = dgl.to_block(g)
-
-    size = (g.num_src_nodes(), g.num_dst_nodes())
-
-    if mode == "bipartite":
-        in_feats = (10, 3)
-        nfeat = (
-            torch.randn(size[0], in_feats[0]).to(device),
-            torch.randn(size[1], in_feats[1]).to(device),
-        )
-    elif mode == "share_weights":
-        in_feats = 5
-        nfeat = (
-            torch.randn(size[0], in_feats).to(device),
-            torch.randn(size[1], in_feats).to(device),
-        )
-    else:
-        in_feats = 7
-        nfeat = torch.randn(size[0], in_feats).to(device)
-    out_feats = 2
-
-    if sparse_format == "coo":
-        sg = SparseGraph(
-            size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc"
-        )
-    elif sparse_format == "csc":
-        offsets, indices, _ = g.adj_tensors("csc")
-        sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc")
-
-    args = (in_feats, out_feats, num_heads)
-    kwargs = {
-        "bias": False,
-        "allow_zero_in_degree": True,
-        "residual": residual,
-        "share_weights": mode == "share_weights",
-    }
-
-    conv1 = GATv2Conv(*args, **kwargs).to(device)
-    conv2 = CuGraphGATv2Conv(*args, **kwargs).to(device)
-
-    with torch.no_grad():
-        conv2.attn_weights.copy_(conv1.attn.flatten())
-        conv2.lin_src.weight.copy_(conv1.fc_src.weight)
-        conv2.lin_dst.weight.copy_(conv1.fc_dst.weight)
-        if residual:
-            conv2.lin_res.weight.copy_(conv1.res_fc.weight)
-
-    out1 = conv1(g, nfeat)
-    if sparse_format is not None:
-        out2 = conv2(sg, nfeat, max_in_degree=max_in_degree)
-    else:
-        out2 = conv2(g, nfeat, max_in_degree=max_in_degree)
-
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_out1 = torch.randn_like(out1)
-    grad_out2 = grad_out1.detach().clone()
-    out1.backward(grad_out1)
-    out2.backward(grad_out2)
-
-    assert torch.allclose(
-        conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL
-    )
-    assert torch.allclose(
-        conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL
-    )
-
-    if residual:
-        assert torch.allclose(
-            conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL
-        )
-
-    assert torch.allclose(
-        conv1.attn.grad,
-        conv2.attn_weights.grad.view(1, num_heads, out_feats),
-        atol=ATOL,
-    )
-
-
-@pytest.mark.parametrize("bias", [False, True])
-@pytest.mark.parametrize("bipartite", [False, True])
-@pytest.mark.parametrize("concat", [False, True])
-@pytest.mark.parametrize("max_in_degree", [None, 8])
-@pytest.mark.parametrize("num_heads", [1, 2, 7])
-@pytest.mark.parametrize("to_block", [False, True])
-@pytest.mark.parametrize("use_edge_feats", [False, True])
-def test_gatv2conv_edge_feats(
-    dgl_graph_1,
-    bias,
-    bipartite,
-    concat,
-    max_in_degree,
-    num_heads,
-    to_block,
-    use_edge_feats,
-):
-    torch.manual_seed(12345)
-    device = torch.device("cuda")
-    g = dgl_graph_1.to(device)
-
-    if to_block:
-        g = dgl.to_block(g)
-
-    if bipartite:
-        in_feats = (10, 3)
-        nfeat = (
-            torch.rand(g.num_src_nodes(), in_feats[0]).to(device),
-            torch.rand(g.num_dst_nodes(), in_feats[1]).to(device),
-        )
-    else:
-        in_feats = 10
-        nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device)
-    out_feats = 2
-
-    if use_edge_feats:
-        edge_feats = 3
-        efeat = torch.rand(g.num_edges(), edge_feats).to(device)
-    else:
-        edge_feats = None
-        efeat = None
-
-    conv = CuGraphGATv2Conv(
-        in_feats,
-        out_feats,
-        num_heads,
-        concat=concat,
-        edge_feats=edge_feats,
-        bias=bias,
-        allow_zero_in_degree=True,
-    ).to(device)
-    out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree)
-
-    grad_out = torch.randn_like(out)
-    out.backward(grad_out)
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py
deleted file mode 100644
index b5d3686c609..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_dgl.nn.conv.base import SparseGraph
-from cugraph_dgl.nn import RelGraphConv as CuGraphRelGraphConv
-
-dgl = pytest.importorskip("dgl", reason="DGL not available")
-torch = pytest.importorskip("torch", reason="PyTorch not available")
-
-ATOL = 1e-6
-
-
-@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64])
-@pytest.mark.parametrize("max_in_degree", [None, 8])
-@pytest.mark.parametrize("num_bases", [1, 2, 5])
-@pytest.mark.parametrize("regularizer", [None, "basis"])
-@pytest.mark.parametrize("self_loop", [False, True])
-@pytest.mark.parametrize("to_block", [False, True])
-@pytest.mark.parametrize("sparse_format", ["coo", "csc", None])
-def test_relgraphconv_equality(
-    dgl_graph_1,
-    idx_type,
-    max_in_degree,
-    num_bases,
-    regularizer,
-    self_loop,
-    to_block,
-    sparse_format,
-):
-    from dgl.nn.pytorch import RelGraphConv
-
-    torch.manual_seed(12345)
-    device = torch.device("cuda")
-    g = dgl_graph_1.to(device).astype(idx_type)
-
-    if to_block:
-        g = dgl.to_block(g)
-
-    in_feat, out_feat, num_rels = 10, 2, 3
-    args = (in_feat, out_feat, num_rels)
-    kwargs = {
-        "num_bases": num_bases,
-        "regularizer": regularizer,
-        "bias": False,
-        "self_loop": self_loop,
-    }
-
-    g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).to(device)
-    size = (g.num_src_nodes(), g.num_dst_nodes())
-    feat = torch.rand(g.num_src_nodes(), in_feat).to(device)
-
-    if sparse_format == "coo":
-        sg = SparseGraph(
-            size=size,
-            src_ids=g.edges()[0],
-            dst_ids=g.edges()[1],
-            values=g.edata[dgl.ETYPE],
-            formats="csc",
-        )
-    elif sparse_format == "csc":
-        offsets, indices, perm = g.adj_tensors("csc")
-        etypes = g.edata[dgl.ETYPE][perm]
-        sg = SparseGraph(
-            size=size, src_ids=indices, cdst_ids=offsets, values=etypes, formats="csc"
-        )
-
-    conv1 = RelGraphConv(*args, **kwargs).to(device)
-    conv2 = CuGraphRelGraphConv(*args, **kwargs, apply_norm=False).to(device)
-
-    with torch.no_grad():
-        if self_loop:
-            conv2.W[:-1].copy_(conv1.linear_r.W)
-            conv2.W[-1].copy_(conv1.loop_weight)
-        else:
-            conv2.W.copy_(conv1.linear_r.W)
-
-        if regularizer is not None:
-            conv2.coeff.copy_(conv1.linear_r.coeff)
-
-    out1 = conv1(g, feat, g.edata[dgl.ETYPE])
-
-    if sparse_format is not None:
-        out2 = conv2(sg, feat, sg.values(), max_in_degree=max_in_degree)
-    else:
-        out2 = conv2(g, feat, g.edata[dgl.ETYPE], max_in_degree=max_in_degree)
-
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_out = torch.randn_like(out1)
-    out1.backward(grad_out)
-    out2.backward(grad_out)
-
-    end = -1 if self_loop else None
-    assert torch.allclose(conv1.linear_r.W.grad, conv2.W.grad[:end], atol=ATOL)
-
-    if self_loop:
-        assert torch.allclose(conv1.loop_weight.grad, conv2.W.grad[-1], atol=ATOL)
-
-    if regularizer is not None:
-        assert torch.allclose(conv1.linear_r.coeff.grad, conv2.coeff.grad, atol=ATOL)
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py
deleted file mode 100644
index 3f1c2b1b3fe..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_dgl.nn.conv.base import SparseGraph
-from cugraph_dgl.nn import SAGEConv as CuGraphSAGEConv
-
-dgl = pytest.importorskip("dgl", reason="DGL not available")
-torch = pytest.importorskip("torch", reason="PyTorch not available")
-
-ATOL = 1e-6
-
-
-@pytest.mark.parametrize("aggr", ["mean", "pool"])
-@pytest.mark.parametrize("bias", [False, True])
-@pytest.mark.parametrize("bipartite", [False, True])
-@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64])
-@pytest.mark.parametrize("max_in_degree", [None, 8])
-@pytest.mark.parametrize("to_block", [False, True])
-@pytest.mark.parametrize("sparse_format", ["coo", "csc", None])
-def test_sageconv_equality(
-    dgl_graph_1, aggr, bias, bipartite, idx_type, max_in_degree, to_block, sparse_format
-):
-    from dgl.nn.pytorch import SAGEConv
-
-    torch.manual_seed(12345)
-    device = torch.device("cuda")
-    g = dgl_graph_1.to(device).astype(idx_type)
-
-    if to_block:
-        g = dgl.to_block(g)
-
-    size = (g.num_src_nodes(), g.num_dst_nodes())
-
-    if bipartite:
-        in_feats = (5, 3)
-        feat = (
-            torch.rand(size[0], in_feats[0], requires_grad=True).to(device),
-            torch.rand(size[1], in_feats[1], requires_grad=True).to(device),
-        )
-    else:
-        in_feats = 5
-        feat = torch.rand(size[0], in_feats).to(device)
-    out_feats = 2
-
-    if sparse_format == "coo":
-        sg = SparseGraph(
-            size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc"
-        )
-    elif sparse_format == "csc":
-        offsets, indices, _ = g.adj_tensors("csc")
-        sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc")
-
-    kwargs = {"aggregator_type": aggr, "bias": bias}
-    conv1 = SAGEConv(in_feats, out_feats, **kwargs).to(device)
-    conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).to(device)
-
-    in_feats_src = conv2.in_feats_src
-    with torch.no_grad():
-        conv2.lin.weight[:, :in_feats_src].copy_(conv1.fc_neigh.weight)
-        conv2.lin.weight[:, in_feats_src:].copy_(conv1.fc_self.weight)
-        if bias:
-            conv2.lin.bias.copy_(conv1.fc_self.bias)
-        if aggr == "pool":
-            conv2.pre_lin.weight.copy_(conv1.fc_pool.weight)
-            conv2.pre_lin.bias.copy_(conv1.fc_pool.bias)
-
-    out1 = conv1(g, feat)
-    if sparse_format is not None:
-        out2 = conv2(sg, feat, max_in_degree=max_in_degree)
-    else:
-        out2 = conv2(g, feat, max_in_degree=max_in_degree)
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_out = torch.randn_like(out1)
-    out1.backward(grad_out)
-    out2.backward(grad_out)
-    assert torch.allclose(
-        conv1.fc_neigh.weight.grad,
-        conv2.lin.weight.grad[:, :in_feats_src],
-        atol=ATOL,
-    )
-    assert torch.allclose(
-        conv1.fc_self.weight.grad,
-        conv2.lin.weight.grad[:, in_feats_src:],
-        atol=ATOL,
-    )
-    if bias:
-        assert torch.allclose(conv1.fc_self.bias.grad, conv2.lin.bias.grad, atol=ATOL)
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sparsegraph.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sparsegraph.py
deleted file mode 100644
index 09c0df202ff..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sparsegraph.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from cugraph.utilities.utils import import_optional
-from cugraph_dgl.nn import SparseGraph
-
-torch = import_optional("torch")
-
-
-def test_coo2csc(sparse_graph_1):
-    data = sparse_graph_1
-
-    g = SparseGraph(
-        size=data.size,
-        src_ids=data.src_ids,
-        dst_ids=data.dst_ids,
-        values=data.values,
-        formats=["csc"],
-    )
-    cdst_ids, src_ids, values = g.csc()
-
-    new = torch.sparse_csc_tensor(cdst_ids, src_ids, values).cuda()
-    old = torch.sparse_coo_tensor(
-        torch.vstack((data.src_ids, data.dst_ids)), data.values
-    ).cuda()
-    torch.allclose(new.to_dense(), old.to_dense())
-
-
-def test_csc_input(sparse_graph_1):
-    data = sparse_graph_1
-
-    g = SparseGraph(
-        size=data.size,
-        src_ids=data.src_ids_sorted_by_dst,
-        cdst_ids=data.cdst_ids,
-        values=data.values_csc,
-        formats=["coo", "csc", "csr"],
-    )
-    src_ids, dst_ids, values = g.coo()
-
-    new = torch.sparse_coo_tensor(torch.vstack((src_ids, dst_ids)), values).cuda()
-    old = torch.sparse_csc_tensor(
-        data.cdst_ids, data.src_ids_sorted_by_dst, data.values_csc
-    ).cuda()
-    torch.allclose(new.to_dense(), old.to_dense())
-
-    csrc_ids, dst_ids, values = g.csr()
-
-    new = torch.sparse_csr_tensor(csrc_ids, dst_ids, values).cuda()
-    torch.allclose(new.to_dense(), old.to_dense())
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py
deleted file mode 100644
index 28d13dedec8..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_dgl.nn.conv.base import SparseGraph
-from cugraph_dgl.nn import TransformerConv
-
-dgl = pytest.importorskip("dgl", reason="DGL not available")
-torch = pytest.importorskip("torch", reason="PyTorch not available")
-
-ATOL = 1e-6
-
-
-@pytest.mark.parametrize("beta", [False, True])
-@pytest.mark.parametrize("bipartite_node_feats", [False, True])
-@pytest.mark.parametrize("concat", [False, True])
-@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64])
-@pytest.mark.parametrize("num_heads", [1, 3, 4])
-@pytest.mark.parametrize("to_block", [False, True])
-@pytest.mark.parametrize("use_edge_feats", [False, True])
-@pytest.mark.parametrize("sparse_format", ["coo", "csc", None])
-def test_transformerconv(
-    dgl_graph_1,
-    beta,
-    bipartite_node_feats,
-    concat,
-    idx_type,
-    num_heads,
-    to_block,
-    use_edge_feats,
-    sparse_format,
-):
-    torch.manual_seed(12345)
-    device = torch.device("cuda")
-    g = dgl_graph_1.to(device).astype(idx_type)
-
-    if to_block:
-        g = dgl.to_block(g)
-
-    size = (g.num_src_nodes(), g.num_dst_nodes())
-    if sparse_format == "coo":
-        sg = SparseGraph(
-            size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc"
-        )
-    elif sparse_format == "csc":
-        offsets, indices, _ = g.adj_tensors("csc")
-        sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc")
-
-    if bipartite_node_feats:
-        in_node_feats = (5, 3)
-        nfeat = (
-            torch.rand(g.num_src_nodes(), in_node_feats[0], device=device),
-            torch.rand(g.num_dst_nodes(), in_node_feats[1], device=device),
-        )
-    else:
-        in_node_feats = 3
-        nfeat = torch.rand(g.num_src_nodes(), in_node_feats, device=device)
-    out_node_feats = 2
-
-    if use_edge_feats:
-        edge_feats = 3
-        efeat = torch.rand(g.num_edges(), edge_feats, device=device)
-    else:
-        edge_feats = None
-        efeat = None
-
-    conv = TransformerConv(
-        in_node_feats,
-        out_node_feats,
-        num_heads=num_heads,
-        concat=concat,
-        beta=beta,
-        edge_feats=edge_feats,
-    ).to(device)
-
-    if sparse_format is not None:
-        out = conv(sg, nfeat, efeat)
-    else:
-        out = conv(g, nfeat, efeat)
-
-    grad_out = torch.randn_like(out)
-    out.backward(grad_out)
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_cugraph_storage.py b/python/cugraph-dgl/cugraph_dgl/tests/test_cugraph_storage.py
deleted file mode 100644
index 0a99d4d65b7..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/test_cugraph_storage.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-try:
-    import cugraph_dgl
-except ModuleNotFoundError:
-    pytest.skip("cugraph_dgl not available", allow_module_level=True)
-
-from cugraph.utilities.utils import import_optional
-import cudf
-import numpy as np
-from cugraph_dgl import CuGraphStorage
-from .utils import assert_same_sampling_len
-
-th = import_optional("torch")
-dgl = import_optional("dgl")
-
-
-@pytest.fixture()
-def dgl_graph():
-    graph_data = {
-        ("nt.a", "connects", "nt.b"): (
-            th.tensor([0, 1, 2]),
-            th.tensor([0, 1, 2]),
-        ),
-        ("nt.a", "connects", "nt.c"): (
-            th.tensor([0, 1, 2]),
-            th.tensor([0, 1, 2]),
-        ),
-        ("nt.c", "connects", "nt.c"): (
-            th.tensor([1, 3, 4, 5]),
-            th.tensor([0, 0, 0, 0]),
-        ),
-    }
-    g = dgl.heterograph(graph_data)
-    return g
-
-
-def test_cugraphstore_basic_apis():
-
-    num_nodes_dict = {"drug": 3, "gene": 2, "disease": 1}
-    # edges
-    drug_interacts_drug_df = cudf.DataFrame({"src": [0, 1], "dst": [1, 2]})
-    drug_interacts_gene = cudf.DataFrame({"src": [0, 1], "dst": [0, 1]})
-    drug_treats_disease = cudf.DataFrame({"src": [1], "dst": [0]})
-    data_dict = {
-        ("drug", "interacts", "drug"): drug_interacts_drug_df,
-        ("drug", "interacts", "gene"): drug_interacts_gene,
-        ("drug", "treats", "disease"): drug_treats_disease,
-    }
-    gs = CuGraphStorage(data_dict=data_dict, num_nodes_dict=num_nodes_dict)
-    # add node data
-    gs.add_node_data(
-        ntype="drug",
-        feat_name="node_feat",
-        feat_obj=th.as_tensor([0.1, 0.2, 0.3], dtype=th.float64),
-    )
-    # add edge data
-    gs.add_edge_data(
-        canonical_etype=("drug", "interacts", "drug"),
-        feat_name="edge_feat",
-        feat_obj=th.as_tensor([0.2, 0.4], dtype=th.float64),
-    )
-
-    assert gs.num_nodes() == 6
-
-    assert gs.num_edges(("drug", "interacts", "drug")) == 2
-    assert gs.num_edges(("drug", "interacts", "gene")) == 2
-    assert gs.num_edges(("drug", "treats", "disease")) == 1
-
-    node_feat = (
-        gs.get_node_storage(key="node_feat", ntype="drug")
-        .fetch([0, 1, 2])
-        .to("cpu")
-        .numpy()
-    )
-    np.testing.assert_equal(node_feat, np.asarray([0.1, 0.2, 0.3]))
-
-    edge_feat = (
-        gs.get_edge_storage(key="edge_feat", etype=("drug", "interacts", "drug"))
-        .fetch([0, 1])
-        .to("cpu")
-        .numpy()
-    )
-    np.testing.assert_equal(edge_feat, np.asarray([0.2, 0.4]))
-
-
-def test_sampling_heterograph(dgl_graph):
-    cugraph_gs = cugraph_dgl.cugraph_storage_from_heterograph(dgl_graph)
-
-    for fanout in [1, 2, 3, -1]:
-        for ntype in ["nt.a", "nt.b", "nt.c"]:
-            for d in ["in", "out"]:
-                assert_same_sampling_len(
-                    dgl_graph,
-                    cugraph_gs,
-                    nodes={ntype: [0]},
-                    fanout=fanout,
-                    edge_dir=d,
-                )
-
-
-def test_sampling_homogenous():
-    src_ar = np.asarray([0, 1, 2, 0, 1, 2, 7, 9, 10, 11], dtype=np.int32)
-    dst_ar = np.asarray([3, 4, 5, 6, 7, 8, 6, 6, 6, 6], dtype=np.int32)
-    g = dgl.heterograph({("a", "connects", "a"): (src_ar, dst_ar)})
-    cugraph_gs = cugraph_dgl.cugraph_storage_from_heterograph(g)
-    # Convert to homogeneous
-    g = dgl.to_homogeneous(g)
-    nodes = [6]
-    # Test for multiple fanouts
-    for fanout in [1, 2, 3]:
-        exp_g = g.sample_neighbors(nodes, fanout=fanout)
-        cu_g = cugraph_gs.sample_neighbors(nodes, fanout=fanout)
-        exp_src, exp_dst = exp_g.edges()
-        cu_src, cu_dst = cu_g.edges()
-        assert len(exp_src) == len(cu_src)
-
-    # Test same results for all neighbours
-    exp_g = g.sample_neighbors(nodes, fanout=-1)
-    cu_g = cugraph_gs.sample_neighbors(nodes, fanout=-1)
-    exp_src, exp_dst = exp_g.edges()
-    exp_src, exp_dst = exp_src.numpy(), exp_dst.numpy()
-
-    cu_src, cu_dst = cu_g.edges()
-    cu_src, cu_dst = cu_src.to("cpu").numpy(), cu_dst.to("cpu").numpy()
-
-    # Assert same values sorted by src
-    exp_src_perm = exp_src.argsort()
-    exp_src = exp_src[exp_src_perm]
-    exp_dst = exp_dst[exp_src_perm]
-
-    cu_src_perm = cu_src.argsort()
-    cu_src = cu_src[cu_src_perm]
-    cu_dst = cu_dst[cu_src_perm]
-
-    np.testing.assert_equal(exp_dst, cu_dst)
-    np.testing.assert_equal(exp_src, cu_src)
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py b/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py
deleted file mode 100644
index 667a4a2e66d..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import pytest
-
-try:
-    import cugraph_dgl
-except ModuleNotFoundError:
-    pytest.skip("cugraph_dgl not available", allow_module_level=True)
-
-from cugraph.utilities.utils import import_optional
-from .utils import (
-    assert_same_edge_feats,
-    assert_same_edge_feats_daskapi,
-    assert_same_node_feats,
-    assert_same_node_feats_daskapi,
-    assert_same_num_edges_can_etypes,
-    assert_same_num_edges_etypes,
-    assert_same_num_nodes,
-)
-
-th = import_optional("torch")
-dgl = import_optional("dgl")
-F = import_optional("dgl.backend")
-
-
-def create_heterograph1(idtype):
-    ctx = th.device("cuda")
-    graph_data = {
-        ("nt.a", "join.1", "nt.a"): (
-            F.tensor([0, 1, 2], dtype=idtype),
-            F.tensor([0, 1, 2], dtype=idtype),
-        ),
-        ("nt.a", "join.2", "nt.a"): (
-            F.tensor([0, 1, 2], dtype=idtype),
-            F.tensor([0, 1, 2], dtype=idtype),
-        ),
-    }
-    g = dgl.heterograph(graph_data, device=th.device("cuda"))
-    g.nodes["nt.a"].data["h"] = F.copy_to(F.tensor([1, 1, 1], dtype=idtype), ctx=ctx)
-    return g
-
-
-def create_heterograph2(idtype):
-    ctx = th.device("cuda")
-
-    g = dgl.heterograph(
-        {
-            ("user", "plays", "game"): (
-                F.tensor([0, 1, 1, 2], dtype=idtype),
-                F.tensor([0, 0, 1, 1], dtype=idtype),
-            ),
-            ("developer", "develops", "game"): (
-                F.tensor([0, 1], dtype=idtype),
-                F.tensor([0, 1], dtype=idtype),
-            ),
-            ("developer", "tests", "game"): (
-                F.tensor([0, 1], dtype=idtype),
-                F.tensor([0, 1], dtype=idtype),
-            ),
-        },
-        idtype=idtype,
-        device=th.device("cuda"),
-    )
-
-    g.nodes["user"].data["h"] = F.copy_to(F.tensor([1, 1, 1], dtype=idtype), ctx=ctx)
-    g.nodes["user"].data["p"] = F.copy_to(F.tensor([1, 1, 1], dtype=idtype), ctx=ctx)
-    g.nodes["game"].data["h"] = F.copy_to(F.tensor([2, 2], dtype=idtype), ctx=ctx)
-    g.nodes["developer"].data["h"] = F.copy_to(F.tensor([3, 3], dtype=idtype), ctx=ctx)
-    g.edges["plays"].data["h"] = F.copy_to(
-        F.tensor([1, 1, 1, 1], dtype=idtype), ctx=ctx
-    )
-    return g
-
-
-def create_heterograph3(idtype):
-    ctx = th.device("cuda")
-
-    g = dgl.heterograph(
-        {
-            ("user", "follows", "user"): (
-                F.tensor([0, 1, 1, 2, 2, 2], dtype=idtype),
-                F.tensor([0, 0, 1, 1, 2, 2], dtype=idtype),
-            ),
-            ("user", "plays", "game"): (
-                F.tensor([0, 1], dtype=idtype),
-                F.tensor([0, 1], dtype=idtype),
-            ),
-        },
-        idtype=idtype,
-        device=th.device("cuda"),
-    )
-    g.nodes["user"].data["h"] = F.copy_to(F.tensor([1, 1, 1], dtype=idtype), ctx=ctx)
-    g.nodes["game"].data["h"] = F.copy_to(F.tensor([2, 2], dtype=idtype), ctx=ctx)
-    g.edges["follows"].data["h"] = F.copy_to(
-        F.tensor([10, 20, 30, 40, 50, 60], dtype=idtype), ctx=ctx
-    )
-    g.edges["follows"].data["p"] = F.copy_to(
-        F.tensor([1, 2, 3, 4, 5, 6], dtype=idtype), ctx=ctx
-    )
-    g.edges["plays"].data["h"] = F.copy_to(F.tensor([1, 2], dtype=idtype), ctx=ctx)
-    return g
-
-
-def create_heterograph4(idtype):
-    ctx = th.device("cuda")
-
-    g = dgl.heterograph(
-        {
-            ("user", "follows", "user"): (
-                F.tensor([1, 2], dtype=idtype),
-                F.tensor([0, 1], dtype=idtype),
-            ),
-            ("user", "plays", "game"): (
-                F.tensor([0, 1], dtype=idtype),
-                F.tensor([0, 1], dtype=idtype),
-            ),
-        },
-        idtype=idtype,
-        device=th.device("cuda"),
-    )
-    g.nodes["user"].data["h"] = F.copy_to(F.tensor([1, 1, 1], dtype=idtype), ctx=ctx)
-    g.nodes["game"].data["h"] = F.copy_to(F.tensor([2, 2], dtype=idtype), ctx=ctx)
-    g.edges["follows"].data["h"] = F.copy_to(F.tensor([1, 2], dtype=idtype), ctx=ctx)
-    g.edges["plays"].data["h"] = F.copy_to(F.tensor([1, 2], dtype=idtype), ctx=ctx)
-    return g
-
-
-@pytest.mark.parametrize("idxtype", [th.int32, th.int64])
-def test_heterograph_conversion_nodes_daskapi(idxtype):
-    graph_fs = [
-        create_heterograph1,
-        create_heterograph2,
-        create_heterograph3,
-        create_heterograph4,
-    ]
-    for graph_f in graph_fs:
-        g = graph_f(idxtype)
-        gs = cugraph_dgl.cugraph_storage_from_heterograph(g)
-
-        assert_same_num_nodes(gs, g)
-        assert_same_node_feats_daskapi(gs, g)
-
-
-@pytest.mark.parametrize("idxtype", [th.int32, th.int64])
-def test_heterograph_conversion_edges_daskapi(idxtype):
-    graph_fs = [
-        create_heterograph1,
-        create_heterograph2,
-        create_heterograph3,
-        create_heterograph4,
-    ]
-    for graph_f in graph_fs:
-        g = graph_f(idxtype)
-        gs = cugraph_dgl.cugraph_storage_from_heterograph(g)
-
-        assert_same_num_edges_can_etypes(gs, g)
-        assert_same_num_edges_etypes(gs, g)
-        assert_same_edge_feats_daskapi(gs, g)
-
-
-@pytest.mark.parametrize("idxtype", [th.int32, th.int64])
-def test_heterograph_conversion_nodes(idxtype):
-    graph_fs = [
-        create_heterograph1,
-        create_heterograph2,
-        create_heterograph3,
-        create_heterograph4,
-    ]
-    for graph_f in graph_fs:
-        g = graph_f(idxtype)
-        gs = cugraph_dgl.cugraph_dgl_graph_from_heterograph(g)
-
-        assert_same_num_nodes(gs, g)
-        assert_same_node_feats(gs, g)
-
-
-@pytest.mark.parametrize("idxtype", [th.int32, th.int64])
-def test_heterograph_conversion_edges(idxtype):
-    graph_fs = [
-        create_heterograph1,
-        create_heterograph2,
-        create_heterograph3,
-        create_heterograph4,
-    ]
-    for graph_f in graph_fs:
-        g = graph_f(idxtype)
-        gs = cugraph_dgl.cugraph_dgl_graph_from_heterograph(g)
-
-        assert_same_num_edges_can_etypes(gs, g)
-        assert_same_num_edges_etypes(gs, g)
-        assert_same_edge_feats(gs, g)
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py
deleted file mode 100644
index a60db97b8d6..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py
+++ /dev/null
@@ -1,217 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-import cugraph_dgl
-import pylibcugraph
-import cupy
-import numpy as np
-
-from cugraph.datasets import karate
-from cugraph.utilities.utils import import_optional, MissingModule
-
-torch = import_optional("torch")
-dgl = import_optional("dgl")
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
-@pytest.mark.parametrize("direction", ["out", "in"])
-def test_graph_make_homogeneous_graph(direction):
-    df = karate.get_edgelist()
-    df.src = df.src.astype("int64")
-    df.dst = df.dst.astype("int64")
-    wgt = np.random.random((len(df),))
-
-    graph = cugraph_dgl.Graph()
-    num_nodes = max(df.src.max(), df.dst.max()) + 1
-    node_x = np.random.random((num_nodes,))
-
-    graph.add_nodes(
-        num_nodes, data={"num": torch.arange(num_nodes, dtype=torch.int64), "x": node_x}
-    )
-    graph.add_edges(df.src, df.dst, {"weight": wgt})
-    plc_dgl_graph = graph._graph(direction=direction)
-
-    assert graph.num_nodes() == num_nodes
-    assert graph.num_edges() == len(df)
-    assert graph.is_homogeneous
-    assert not graph.is_multi_gpu
-
-    assert (
-        graph.nodes() == torch.arange(num_nodes, dtype=torch.int64, device="cuda")
-    ).all()
-
-    assert graph.nodes[None]["x"] is not None
-    assert (graph.nodes[None]["x"] == torch.as_tensor(node_x, device="cuda")).all()
-    assert (
-        graph.nodes[None]["num"]
-        == torch.arange(num_nodes, dtype=torch.int64, device="cuda")
-    ).all()
-
-    assert (
-        graph.edges("eid", device="cuda")
-        == torch.arange(len(df), dtype=torch.int64, device="cuda")
-    ).all()
-    assert (graph.edges[None]["weight"] == torch.as_tensor(wgt, device="cuda")).all()
-
-    plc_expected_graph = pylibcugraph.SGGraph(
-        pylibcugraph.ResourceHandle(),
-        pylibcugraph.GraphProperties(is_multigraph=True, is_symmetric=False),
-        df.src if direction == "out" else df.dst,
-        df.dst if direction == "out" else df.src,
-        vertices_array=cupy.arange(num_nodes, dtype="int64"),
-    )
-
-    # Do the expensive check to make sure this test fails if an invalid
-    # graph is constructed.
-    v_actual, d_in_actual, d_out_actual = pylibcugraph.degrees(
-        pylibcugraph.ResourceHandle(),
-        plc_dgl_graph,
-        source_vertices=cupy.arange(num_nodes, dtype="int64"),
-        do_expensive_check=True,
-    )
-
-    v_exp, d_in_exp, d_out_exp = pylibcugraph.degrees(
-        pylibcugraph.ResourceHandle(),
-        plc_expected_graph,
-        source_vertices=cupy.arange(num_nodes, dtype="int64"),
-        do_expensive_check=True,
-    )
-
-    assert (v_actual == v_exp).all()
-    assert (d_in_actual == d_in_exp).all()
-    assert (d_out_actual == d_out_exp).all()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
-@pytest.mark.parametrize("direction", ["out", "in"])
-def test_graph_make_heterogeneous_graph(direction):
-    df = karate.get_edgelist()
-    df.src = df.src.astype("int64")
-    df.dst = df.dst.astype("int64")
-
-    graph = cugraph_dgl.Graph()
-    total_num_nodes = max(df.src.max(), df.dst.max()) + 1
-
-    num_nodes_group_1 = total_num_nodes // 2
-    num_nodes_group_2 = total_num_nodes - num_nodes_group_1
-
-    node_x_1 = np.random.random((num_nodes_group_1,))
-    node_x_2 = np.random.random((num_nodes_group_2,))
-
-    graph.add_nodes(num_nodes_group_1, {"x": node_x_1}, "type1")
-    graph.add_nodes(num_nodes_group_2, {"x": node_x_2}, "type2")
-
-    edges_11 = df[(df.src < num_nodes_group_1) & (df.dst < num_nodes_group_1)]
-    edges_12 = df[(df.src < num_nodes_group_1) & (df.dst >= num_nodes_group_1)]
-    edges_21 = df[(df.src >= num_nodes_group_1) & (df.dst < num_nodes_group_1)]
-    edges_22 = df[(df.src >= num_nodes_group_1) & (df.dst >= num_nodes_group_1)]
-
-    edges_12.dst -= num_nodes_group_1
-    edges_21.src -= num_nodes_group_1
-    edges_22.dst -= num_nodes_group_1
-    edges_22.src -= num_nodes_group_1
-
-    graph.add_edges(edges_11.src, edges_11.dst, etype=("type1", "e1", "type1"))
-    graph.add_edges(edges_12.src, edges_12.dst, etype=("type1", "e2", "type2"))
-    graph.add_edges(edges_21.src, edges_21.dst, etype=("type2", "e3", "type1"))
-    graph.add_edges(edges_22.src, edges_22.dst, etype=("type2", "e4", "type2"))
-
-    assert not graph.is_homogeneous
-    assert not graph.is_multi_gpu
-
-    # Verify graph.nodes()
-    assert (
-        graph.nodes() == torch.arange(total_num_nodes, dtype=torch.int64, device="cuda")
-    ).all()
-    assert (
-        graph.nodes("type1")
-        == torch.arange(num_nodes_group_1, dtype=torch.int64, device="cuda")
-    ).all()
-    assert (
-        graph.nodes("type2")
-        == torch.arange(num_nodes_group_2, dtype=torch.int64, device="cuda")
-    ).all()
-
-    # Verify graph.edges()
-    assert (
-        graph.edges("eid", etype=("type1", "e1", "type1"))
-        == torch.arange(len(edges_11), dtype=torch.int64, device="cuda")
-    ).all()
-    assert (
-        graph.edges("eid", etype=("type1", "e2", "type2"))
-        == torch.arange(len(edges_12), dtype=torch.int64, device="cuda")
-    ).all()
-    assert (
-        graph.edges("eid", etype=("type2", "e3", "type1"))
-        == torch.arange(len(edges_21), dtype=torch.int64, device="cuda")
-    ).all()
-    assert (
-        graph.edges("eid", etype=("type2", "e4", "type2"))
-        == torch.arange(len(edges_22), dtype=torch.int64, device="cuda")
-    ).all()
-
-    # Use sampling call to check graph creation
-    # This isn't a test of cuGraph sampling with DGL; the options are
-    # set to verify the graph only.
-    plc_graph = graph._graph(direction)
-    sampling_output = pylibcugraph.uniform_neighbor_sample(
-        pylibcugraph.ResourceHandle(),
-        plc_graph,
-        start_list=cupy.arange(total_num_nodes, dtype="int64"),
-        h_fan_out=np.array([1, 1], dtype="int32"),
-        with_replacement=False,
-        do_expensive_check=True,
-        with_edge_properties=True,
-        prior_sources_behavior="exclude",
-        return_dict=True,
-    )
-
-    expected_etypes = {
-        0: "e1",
-        1: "e2",
-        2: "e3",
-        3: "e4",
-    }
-    expected_offsets = {
-        0: (0, 0),
-        1: (0, num_nodes_group_1),
-        2: (num_nodes_group_1, 0),
-        3: (num_nodes_group_1, num_nodes_group_1),
-    }
-    if direction == "in":
-        src_col = "minors"
-        dst_col = "majors"
-    else:
-        src_col = "majors"
-        dst_col = "minors"
-
-    # Looping over the output verifies that all edges are valid
-    # (and therefore, the graph is valid)
-    for i, etype in enumerate(sampling_output["edge_type"].tolist()):
-        eid = int(sampling_output["edge_id"][i])
-
-        srcs, dsts, eids = graph.edges(
-            "all", etype=expected_etypes[etype], device="cpu"
-        )
-
-        assert eids[eid] == eid
-        assert (
-            srcs[eid] == int(sampling_output[src_col][i]) - expected_offsets[etype][0]
-        )
-        assert (
-            dsts[eid] == int(sampling_output[dst_col][i]) - expected_offsets[etype][1]
-        )
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py
deleted file mode 100644
index eedda664c52..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py
+++ /dev/null
@@ -1,310 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import pytest
-
-import cugraph_dgl
-import pylibcugraph
-import cupy
-import numpy as np
-
-import cudf
-
-from cugraph.datasets import karate
-from cugraph.utilities.utils import import_optional, MissingModule
-
-from cugraph.gnn import (
-    cugraph_comms_shutdown,
-    cugraph_comms_create_unique_id,
-    cugraph_comms_get_raft_handle,
-)
-
-from .utils import init_pytorch_worker
-
-pylibwholegraph = import_optional("pylibwholegraph")
-torch = import_optional("torch")
-dgl = import_optional("dgl")
-
-
-def run_test_graph_make_homogeneous_graph_mg(rank, uid, world_size, direction):
-    init_pytorch_worker(rank, world_size, uid, init_wholegraph=True)
-
-    df = karate.get_edgelist()
-    df.src = df.src.astype("int64")
-    df.dst = df.dst.astype("int64")
-    wgt = np.random.random((len(df),))
-
-    graph = cugraph_dgl.Graph(
-        is_multi_gpu=True, ndata_storage="wholegraph", edata_storage="wholegraph"
-    )
-
-    # The number of nodes is set globally but features can have
-    # any distribution across workers as long as they are in order.
-    global_num_nodes = max(df.src.max(), df.dst.max()) + 1
-    node_x = np.array_split(np.arange(global_num_nodes, dtype="int64"), world_size)[
-        rank
-    ]
-
-    # Each worker gets a shuffled, permuted version of the edgelist
-    df = df.sample(frac=1.0)
-    df.src = (df.src + rank) % global_num_nodes
-    df.dst = (df.dst + rank + 1) % global_num_nodes
-
-    graph.add_nodes(global_num_nodes, data={"x": node_x})
-    graph.add_edges(df.src, df.dst, {"weight": wgt})
-    plc_dgl_graph = graph._graph(direction=direction)
-
-    assert graph.num_nodes() == global_num_nodes
-    assert graph.num_edges() == len(df) * world_size
-    assert graph.is_homogeneous
-    assert graph.is_multi_gpu
-
-    assert (
-        graph.nodes()
-        == torch.arange(global_num_nodes, dtype=torch.int64, device="cuda")
-    ).all()
-    ix = torch.arange(len(node_x) * rank, len(node_x) * (rank + 1), dtype=torch.int64)
-    assert graph.nodes[ix]["x"] is not None
-    assert (graph.nodes[ix]["x"] == torch.as_tensor(node_x, device="cuda")).all()
-
-    assert (
-        graph.edges("eid", device="cuda")
-        == torch.arange(world_size * len(df), dtype=torch.int64, device="cuda")
-    ).all()
-    ix = torch.arange(len(df) * rank, len(df) * (rank + 1), dtype=torch.int64)
-    assert (graph.edges[ix]["weight"] == torch.as_tensor(wgt, device="cuda")).all()
-
-    plc_handle = pylibcugraph.ResourceHandle(
-        cugraph_comms_get_raft_handle().getHandle()
-    )
-
-    plc_expected_graph = pylibcugraph.MGGraph(
-        plc_handle,
-        pylibcugraph.GraphProperties(is_multigraph=True, is_symmetric=False),
-        [df.src] if direction == "out" else [df.dst],
-        [df.dst] if direction == "out" else [df.src],
-        vertices_array=[
-            cupy.array_split(cupy.arange(global_num_nodes, dtype="int64"), world_size)[
-                rank
-            ]
-        ],
-    )
-
-    # Do the expensive check to make sure this test fails if an invalid
-    # graph is constructed.
-    v_actual, d_in_actual, d_out_actual = pylibcugraph.degrees(
-        plc_handle,
-        plc_dgl_graph,
-        source_vertices=cupy.arange(global_num_nodes, dtype="int64"),
-        do_expensive_check=True,
-    )
-
-    v_exp, d_in_exp, d_out_exp = pylibcugraph.degrees(
-        plc_handle,
-        plc_expected_graph,
-        source_vertices=cupy.arange(global_num_nodes, dtype="int64"),
-        do_expensive_check=True,
-    )
-
-    assert (v_actual == v_exp).all()
-    assert (d_in_actual == d_in_exp).all()
-    assert (d_out_actual == d_out_exp).all()
-
-    cugraph_comms_shutdown()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(
-    isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available"
-)
-@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
-@pytest.mark.parametrize("direction", ["out", "in"])
-def test_graph_make_homogeneous_graph_mg(direction):
-    uid = cugraph_comms_create_unique_id()
-    world_size = torch.cuda.device_count()
-
-    torch.multiprocessing.spawn(
-        run_test_graph_make_homogeneous_graph_mg,
-        args=(
-            uid,
-            world_size,
-            direction,
-        ),
-        nprocs=world_size,
-    )
-
-
-def run_test_graph_make_heterogeneous_graph_mg(rank, uid, world_size, direction):
-    init_pytorch_worker(rank, world_size, uid)
-
-    df = karate.get_edgelist()
-    df.src = df.src.astype("int64")
-    df.dst = df.dst.astype("int64")
-
-    graph = cugraph_dgl.Graph(is_multi_gpu=True)
-    total_num_nodes = max(df.src.max(), df.dst.max()) + 1
-
-    # Each worker gets a shuffled, permuted version of the edgelist
-    df = df.sample(frac=1.0)
-    df.src = (df.src + rank) % total_num_nodes
-    df.dst = (df.dst + rank + 1) % total_num_nodes
-
-    num_nodes_group_1 = total_num_nodes // 2
-    num_nodes_group_2 = total_num_nodes - num_nodes_group_1
-
-    node_x_1 = np.array_split(np.random.random((num_nodes_group_1,)), world_size)[rank]
-    node_x_2 = np.array_split(np.random.random((num_nodes_group_2,)), world_size)[rank]
-
-    graph.add_nodes(num_nodes_group_1, {"x": node_x_1}, "type1")
-    graph.add_nodes(num_nodes_group_2, {"x": node_x_2}, "type2")
-
-    edges_11 = df[(df.src < num_nodes_group_1) & (df.dst < num_nodes_group_1)]
-    edges_12 = df[(df.src < num_nodes_group_1) & (df.dst >= num_nodes_group_1)]
-    edges_21 = df[(df.src >= num_nodes_group_1) & (df.dst < num_nodes_group_1)]
-    edges_22 = df[(df.src >= num_nodes_group_1) & (df.dst >= num_nodes_group_1)]
-
-    edges_12.dst -= num_nodes_group_1
-    edges_21.src -= num_nodes_group_1
-    edges_22.dst -= num_nodes_group_1
-    edges_22.src -= num_nodes_group_1
-
-    total_edges_11 = torch.tensor(len(edges_11), device="cuda", dtype=torch.int64)
-    torch.distributed.all_reduce(total_edges_11, torch.distributed.ReduceOp.SUM)
-    total_edges_12 = torch.tensor(len(edges_12), device="cuda", dtype=torch.int64)
-    torch.distributed.all_reduce(total_edges_12, torch.distributed.ReduceOp.SUM)
-    total_edges_21 = torch.tensor(len(edges_21), device="cuda", dtype=torch.int64)
-    torch.distributed.all_reduce(total_edges_21, torch.distributed.ReduceOp.SUM)
-    total_edges_22 = torch.tensor(len(edges_22), device="cuda", dtype=torch.int64)
-    torch.distributed.all_reduce(total_edges_22, torch.distributed.ReduceOp.SUM)
-
-    graph.add_edges(edges_11.src, edges_11.dst, etype=("type1", "e1", "type1"))
-    graph.add_edges(edges_12.src, edges_12.dst, etype=("type1", "e2", "type2"))
-    graph.add_edges(edges_21.src, edges_21.dst, etype=("type2", "e3", "type1"))
-    graph.add_edges(edges_22.src, edges_22.dst, etype=("type2", "e4", "type2"))
-
-    assert not graph.is_homogeneous
-    assert graph.is_multi_gpu
-
-    # Verify graph.nodes()
-    assert (
-        graph.nodes() == torch.arange(total_num_nodes, dtype=torch.int64, device="cuda")
-    ).all()
-    assert (
-        graph.nodes("type1")
-        == torch.arange(num_nodes_group_1, dtype=torch.int64, device="cuda")
-    ).all()
-    assert (
-        graph.nodes("type2")
-        == torch.arange(num_nodes_group_2, dtype=torch.int64, device="cuda")
-    ).all()
-
-    # Verify graph.edges()
-    assert (
-        graph.edges("eid", etype=("type1", "e1", "type1"))
-        == torch.arange(total_edges_11, dtype=torch.int64, device="cuda")
-    ).all()
-    assert (
-        graph.edges("eid", etype=("type1", "e2", "type2"))
-        == torch.arange(total_edges_12, dtype=torch.int64, device="cuda")
-    ).all()
-    assert (
-        graph.edges("eid", etype=("type2", "e3", "type1"))
-        == torch.arange(total_edges_21, dtype=torch.int64, device="cuda")
-    ).all()
-    assert (
-        graph.edges("eid", etype=("type2", "e4", "type2"))
-        == torch.arange(total_edges_22, dtype=torch.int64, device="cuda")
-    ).all()
-
-    # Use sampling call to check graph creation
-    # This isn't a test of cuGraph sampling with DGL; the options are
-    # set to verify the graph only.
-    plc_graph = graph._graph(direction)
-    assert isinstance(plc_graph, pylibcugraph.MGGraph)
-    sampling_output = pylibcugraph.uniform_neighbor_sample(
-        graph._resource_handle,
-        plc_graph,
-        start_list=cupy.arange(total_num_nodes, dtype="int64"),
-        batch_id_list=cupy.full(total_num_nodes, rank, dtype="int32"),
-        label_list=cupy.arange(world_size, dtype="int32"),
-        label_to_output_comm_rank=cupy.arange(world_size, dtype="int32"),
-        h_fan_out=np.array([-1], dtype="int32"),
-        with_replacement=False,
-        do_expensive_check=True,
-        with_edge_properties=True,
-        prior_sources_behavior="exclude",
-        return_dict=True,
-    )
-
-    sdf = cudf.DataFrame(
-        {
-            "majors": sampling_output["majors"],
-            "minors": sampling_output["minors"],
-            "edge_id": sampling_output["edge_id"],
-            "edge_type": sampling_output["edge_type"],
-        }
-    )
-
-    expected_offsets = {
-        0: (0, 0),
-        1: (0, num_nodes_group_1),
-        2: (num_nodes_group_1, 0),
-        3: (num_nodes_group_1, num_nodes_group_1),
-    }
-    if direction == "in":
-        src_col = "minors"
-        dst_col = "majors"
-    else:
-        src_col = "majors"
-        dst_col = "minors"
-
-    edges_11["etype"] = 0
-    edges_12["etype"] = 1
-    edges_21["etype"] = 2
-    edges_22["etype"] = 3
-
-    cdf = cudf.concat([edges_11, edges_12, edges_21, edges_22])
-    for i in range(len(cdf)):
-        row = cdf.iloc[i]
-        etype = row["etype"]
-        src = row["src"] + expected_offsets[etype][0]
-        dst = row["dst"] + expected_offsets[etype][1]
-
-        f = sdf[
-            (sdf[src_col] == src) & (sdf[dst_col] == dst) & (sdf["edge_type"] == etype)
-        ]
-        assert len(f) > 0  # may be multiple, some could be on other GPU
-
-    cugraph_comms_shutdown()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(
-    isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available"
-)
-@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available")
-@pytest.mark.parametrize("direction", ["out", "in"])
-def test_graph_make_heterogeneous_graph_mg(direction):
-    uid = cugraph_comms_create_unique_id()
-    world_size = torch.cuda.device_count()
-
-    torch.multiprocessing.spawn(
-        run_test_graph_make_heterogeneous_graph_mg,
-        args=(
-            uid,
-            world_size,
-            direction,
-        ),
-        nprocs=world_size,
-    )
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_utils.py b/python/cugraph-dgl/cugraph_dgl/tests/test_utils.py
deleted file mode 100644
index 4be66758b43..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/test_utils.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import cudf
-import cupy as cp
-import numpy as np
-from cugraph_dgl.dataloading.utils.sampling_helpers import (
-    cast_to_tensor,
-    _get_renumber_map,
-    _split_tensor,
-    _get_tensor_d_from_sampled_df,
-    create_homogeneous_sampled_graphs_from_dataframe,
-    _get_source_destination_range,
-    _create_homogeneous_cugraph_dgl_nn_sparse_graph,
-    create_homogeneous_sampled_graphs_from_dataframe_csc,
-)
-from cugraph.utilities.utils import import_optional
-
-dgl = import_optional("dgl")
-torch = import_optional("torch")
-cugraph_dgl = import_optional("cugraph_dgl")
-
-
-def test_casting_empty_array():
-    ar = cp.zeros(shape=0, dtype=cp.int32)
-    ser = cudf.Series(ar)
-    output_tensor = cast_to_tensor(ser)
-    assert output_tensor.dtype == torch.int32
-
-
-def get_dummy_sampled_df():
-    df = cudf.DataFrame()
-    df["sources"] = [0, 0, 1, 0, 0, 1, 0, 0, 2] + [np.nan] * 4
-    df["destinations"] = [1, 2, 0, 1, 2, 1, 2, 0, 1] + [np.nan] * 4
-    df["batch_id"] = [0, 0, 0, 1, 1, 1, 2, 2, 2] + [np.nan] * 4
-    df["hop_id"] = [0, 1, 1, 0, 1, 1, 0, 1, 1] + [np.nan] * 4
-    df["map"] = [4, 7, 10, 13, 10, 11, 12, 13, 14, 15, 16, 17, 18]
-    df = df.astype("int32")
-    df["hop_id"] = df["hop_id"].astype("uint8")
-    df["map"] = df["map"].astype("int64")
-    return df
-
-
-def get_dummy_sampled_df_csc():
-    df_dict = dict(
-        minors=np.array(
-            [1, 1, 2, 1, 0, 3, 1, 3, 2, 3, 2, 4, 0, 1, 1, 0, 3, 2], dtype=np.int32
-        ),
-        major_offsets=np.arange(19, dtype=np.int64),
-        map=np.array(
-            [26, 29, 33, 22, 23, 32, 18, 29, 33, 33, 8, 30, 32], dtype=np.int32
-        ),
-        renumber_map_offsets=np.array([0, 4, 9, 13], dtype=np.int64),
-        label_hop_offsets=np.array([0, 1, 3, 6, 7, 9, 13, 14, 16, 18], dtype=np.int64),
-    )
-
-    # convert values to Series so that NaNs are padded automatically
-    return cudf.DataFrame({k: cudf.Series(v) for k, v in df_dict.items()})
-
-
-def test_get_renumber_map():
-
-    sampled_df = get_dummy_sampled_df()
-
-    df, renumber_map, renumber_map_batch_indices = _get_renumber_map(sampled_df)
-
-    # Ensure that map was dropped
-    assert "map" not in df.columns
-
-    expected_map = torch.as_tensor(
-        [10, 11, 12, 13, 14, 15, 16, 17, 18], dtype=torch.int32, device="cuda"
-    )
-    assert torch.equal(renumber_map, expected_map)
-
-    expected_batch_indices = torch.as_tensor([3, 6], dtype=torch.int32, device="cuda")
-    assert torch.equal(renumber_map_batch_indices, expected_batch_indices)
-
-    # Ensure we dropped the Nans for rows  corresponding to the renumber_map
-    assert len(df) == 9
-
-    t_ls = _split_tensor(renumber_map, renumber_map_batch_indices)
-    assert torch.equal(
-        t_ls[0], torch.as_tensor([10, 11, 12], dtype=torch.int64, device="cuda")
-    )
-    assert torch.equal(
-        t_ls[1], torch.as_tensor([13, 14, 15], dtype=torch.int64, device="cuda")
-    )
-    assert torch.equal(
-        t_ls[2], torch.as_tensor([16, 17, 18], dtype=torch.int64, device="cuda")
-    )
-
-
-def test_get_tensor_d_from_sampled_df():
-    df = get_dummy_sampled_df()
-    tensor_d = _get_tensor_d_from_sampled_df(df)
-
-    expected_maps = {}
-    expected_maps[0] = torch.as_tensor([10, 11, 12], dtype=torch.int64, device="cuda")
-    expected_maps[1] = torch.as_tensor([13, 14, 15], dtype=torch.int64, device="cuda")
-    expected_maps[2] = torch.as_tensor([16, 17, 18], dtype=torch.int64, device="cuda")
-
-    for batch_id, batch_td in tensor_d.items():
-        batch_df = df[df["batch_id"] == batch_id]
-        for hop_id, hop_t in batch_td.items():
-            if hop_id != "map":
-                hop_df = batch_df[batch_df["hop_id"] == hop_id]
-                assert torch.equal(hop_t["sources"], cast_to_tensor(hop_df["sources"]))
-                assert torch.equal(
-                    hop_t["destinations"], cast_to_tensor(hop_df["destinations"])
-                )
-
-        assert torch.equal(batch_td["map"], expected_maps[batch_id])
-
-
-def test_create_homogeneous_sampled_graphs_from_dataframe():
-    sampler = dgl.dataloading.MultiLayerNeighborSampler([2, 2])
-    g = dgl.graph(([0, 10, 20], [0, 0, 10])).to("cuda")
-    dgl_input_nodes, dgl_output_nodes, dgl_blocks = sampler.sample_blocks(
-        g, torch.as_tensor([0]).to("cuda")
-    )
-
-    # Directions are reversed in dgl
-    s1, d1 = dgl_blocks[0].edges()
-    s0, d0 = dgl_blocks[1].edges()
-    srcs = cp.concatenate([cp.asarray(s0), cp.asarray(s1)])
-    dsts = cp.concatenate([cp.asarray(d0), cp.asarray(d1)])
-
-    nids = dgl_blocks[0].srcdata[dgl.NID]
-    nids = cp.concatenate(
-        [cp.asarray([2]), cp.asarray([len(nids) + 2]), cp.asarray(nids)]
-    )
-
-    df = cudf.DataFrame()
-    df["sources"] = srcs
-    df["destinations"] = dsts
-    df["hop_id"] = [0] * len(s0) + [1] * len(s1)
-    df["batch_id"] = 0
-    df["map"] = nids
-
-    (
-        cugraph_input_nodes,
-        cugraph_output_nodes,
-        cugraph_blocks,
-    ) = create_homogeneous_sampled_graphs_from_dataframe(df)[0]
-
-    assert torch.equal(dgl_input_nodes, cugraph_input_nodes)
-    assert torch.equal(dgl_output_nodes, cugraph_output_nodes)
-
-    for c_block, d_block in zip(cugraph_blocks, dgl_blocks):
-        ce, cd = c_block.edges()
-        de, dd = d_block.edges()
-        assert torch.equal(ce, de)
-        assert torch.equal(cd, dd)
-
-
-def test_get_source_destination_range():
-    df = get_dummy_sampled_df()
-    output_d = _get_source_destination_range(df)
-
-    expected_output = {
-        (0, 0): {"sources_range": 0, "destinations_range": 1},
-        (0, 1): {"sources_range": 1, "destinations_range": 2},
-        (1, 0): {"sources_range": 0, "destinations_range": 1},
-        (1, 1): {"sources_range": 1, "destinations_range": 2},
-        (2, 0): {"sources_range": 0, "destinations_range": 2},
-        (2, 1): {"sources_range": 2, "destinations_range": 1},
-    }
-
-    assert output_d == expected_output
-
-
-def test__create_homogeneous_cugraph_dgl_nn_sparse_graph():
-    tensor_d = {
-        "sources_range": 1,
-        "destinations_range": 2,
-        "sources": torch.as_tensor([0, 0, 1, 1], dtype=torch.int64, device="cuda"),
-        "destinations": torch.as_tensor([0, 0, 1, 2], dtype=torch.int64, device="cuda"),
-    }
-
-    seednodes_range = 10
-    sparse_graph = _create_homogeneous_cugraph_dgl_nn_sparse_graph(
-        tensor_d, seednodes_range
-    )
-    assert sparse_graph.num_src_nodes() == 2
-    assert sparse_graph.num_dst_nodes() == seednodes_range + 1
-    assert isinstance(sparse_graph, cugraph_dgl.nn.SparseGraph)
-
-
-def test_create_homogeneous_sampled_graphs_from_dataframe_csc():
-    df = get_dummy_sampled_df_csc()
-    batches = create_homogeneous_sampled_graphs_from_dataframe_csc(df)
-
-    assert len(batches) == 3
-    assert torch.equal(batches[0][0], torch.IntTensor([26, 29, 33, 22]).cuda())
-    assert torch.equal(batches[1][0], torch.IntTensor([23, 32, 18, 29, 33]).cuda())
-    assert torch.equal(batches[2][0], torch.IntTensor([33, 8, 30, 32]).cuda())
diff --git a/python/cugraph-dgl/cugraph_dgl/tests/utils.py b/python/cugraph-dgl/cugraph_dgl/tests/utils.py
deleted file mode 100644
index fa4eb05f297..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/tests/utils.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from cugraph.utilities.utils import import_optional
-from cugraph.gnn import cugraph_comms_init
-
-th = import_optional("torch")
-
-
-def assert_same_node_feats_daskapi(gs, g):
-    assert set(gs.ndata.keys()) == set(g.ndata.keys())
-
-    for key in g.ndata.keys():
-        for ntype in g.ntypes:
-            indices = th.arange(0, g.num_nodes(ntype), dtype=g.idtype).cuda()
-            if len(g.ntypes) <= 1 or ntype in g.ndata[key]:
-                g_output = g.get_node_storage(key=key, ntype=ntype).fetch(
-                    indices, device="cuda"
-                )
-                gs_output = gs.get_node_storage(key=key, ntype=ntype).fetch(indices)
-                equal_t = (gs_output != g_output).sum().cpu()
-                assert equal_t == 0
-
-
-def assert_same_node_feats(gs, g):
-    assert set(gs.ndata.keys()) == set(g.ndata.keys())
-    assert set(gs.ntypes) == set(g.ntypes)
-
-    for key in g.ndata.keys():
-        for ntype in g.ntypes:
-            if len(g.ntypes) <= 1 or ntype in g.ndata[key]:
-                indices = th.arange(0, g.num_nodes(ntype), dtype=g.idtype)
-
-                g_output = g.ndata[key]
-                gs_output = gs.ndata[key]
-
-                if len(g.ntypes) > 1:
-                    g_output = g_output[ntype]
-                    gs_output = gs_output[ntype]
-
-                g_output = g_output[indices]
-                gs_output = gs_output[indices]
-
-                equal_t = (gs_output != g_output).sum()
-                assert equal_t == 0
-
-
-def assert_same_num_nodes(gs, g):
-    for ntype in g.ntypes:
-        assert g.num_nodes(ntype) == gs.num_nodes(ntype)
-
-
-def assert_same_num_edges_can_etypes(gs, g):
-    for can_etype in g.canonical_etypes:
-        assert g.num_edges(can_etype) == gs.num_edges(can_etype)
-
-
-def assert_same_num_edges_etypes(gs, g):
-    for etype in g.etypes:
-        assert g.num_edges(etype) == gs.num_edges(etype)
-
-
-def assert_same_edge_feats_daskapi(gs, g):
-    assert set(gs.edata.keys()) == set(g.edata.keys())
-    for key in g.edata.keys():
-        for etype in g.canonical_etypes:
-            indices = th.arange(0, g.num_edges(etype), dtype=g.idtype).cuda()
-            if len(g.etypes) <= 1 or etype in g.edata[key]:
-                g_output = g.get_edge_storage(key=key, etype=etype).fetch(
-                    indices, device="cuda"
-                )
-                gs_output = gs.get_edge_storage(key=key, etype=etype).fetch(indices)
-                equal_t = (gs_output != g_output).sum().cpu()
-                assert equal_t == 0
-
-
-def assert_same_edge_feats(gs, g):
-    assert set(gs.edata.keys()) == set(g.edata.keys())
-    assert set(gs.canonical_etypes) == set(g.canonical_etypes)
-    assert set(gs.etypes) == set(g.etypes)
-
-    for key in g.edata.keys():
-        for etype in g.canonical_etypes:
-            if len(g.etypes) <= 1 or etype in g.edata[key]:
-                indices = th.arange(0, g.num_edges(etype), dtype=g.idtype).cuda()
-                g_output = g.edata[key]
-                gs_output = gs.edata[key]
-
-                if len(g.etypes) > 1:
-                    g_output = g_output[etype]
-                    gs_output = gs_output[etype]
-
-                g_output = g_output[indices]
-                gs_output = gs_output[indices]
-
-                equal_t = (gs_output != g_output).sum().cpu()
-                assert equal_t == 0
-
-
-def assert_same_sampling_len(dgl_g, cugraph_gs, nodes, fanout, edge_dir):
-    dgl_o = dgl_g.sample_neighbors(nodes, fanout=fanout, edge_dir=edge_dir)
-    cugraph_o = cugraph_gs.sample_neighbors(nodes, fanout=fanout, edge_dir=edge_dir)
-    assert cugraph_o.num_edges() == dgl_o.num_edges()
-    for etype in dgl_o.canonical_etypes:
-        assert dgl_o.num_edges(etype) == cugraph_o.num_edges(etype)
-
-
-def init_pytorch_worker(rank, world_size, cugraph_id, init_wholegraph=False):
-    import rmm
-
-    rmm.reinitialize(
-        devices=rank,
-    )
-
-    import cupy
-
-    cupy.cuda.Device(rank).use()
-    from rmm.allocators.cupy import rmm_cupy_allocator
-
-    cupy.cuda.set_allocator(rmm_cupy_allocator)
-
-    from cugraph.testing.mg_utils import enable_spilling
-
-    enable_spilling()
-
-    th.cuda.set_device(rank)
-
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "12355"
-    th.distributed.init_process_group("nccl", rank=rank, world_size=world_size)
-
-    if init_wholegraph:
-        import pylibwholegraph
-
-        pylibwholegraph.torch.initialize.init(
-            rank,
-            world_size,
-            rank,
-            world_size,
-        )
-
-    cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank)
diff --git a/python/cugraph-dgl/cugraph_dgl/typing.py b/python/cugraph-dgl/cugraph_dgl/typing.py
deleted file mode 100644
index a68463c3fd9..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/typing.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import List, Union, Tuple
-from cugraph.utilities.utils import import_optional
-
-from cugraph_dgl.nn import SparseGraph
-
-import pandas
-import numpy
-import cupy
-import cudf
-
-torch = import_optional("torch")
-dgl = import_optional("dgl")
-
-TensorType = Union[
-    "torch.Tensor",
-    "cupy.ndarray",
-    "numpy.ndarray",
-    "cudf.Series",
-    "pandas.Series",
-    List[int],
-]
-
-DGLSamplerOutput = Tuple[
-    "torch.Tensor",
-    "torch.Tensor",
-    List[Union["dgl.Block", SparseGraph]],
-]
diff --git a/python/cugraph-dgl/cugraph_dgl/utils/__init__.py b/python/cugraph-dgl/cugraph_dgl/utils/__init__.py
deleted file mode 100644
index 081b2ae8260..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/utils/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py b/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py
deleted file mode 100644
index 2ba04bd916f..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Utils to convert b/w dgl heterograph to cugraph GraphStore
-from __future__ import annotations
-from typing import Dict, Tuple, Union
-
-from cugraph_dgl.typing import TensorType
-
-import cudf
-import pandas as pd
-import dask.dataframe as dd
-import dask_cudf
-from dask.distributed import get_client
-import cupy as cp
-import numpy as np
-from cugraph.utilities.utils import import_optional
-from cugraph.gnn.dgl_extensions.dgl_uniform_sampler import src_n, dst_n
-
-dgl = import_optional("dgl")
-F = import_optional("dgl.backend")
-torch = import_optional("torch")
-
-
-# Feature Tensor to DataFrame Utils
-def convert_to_column_major(t: torch.Tensor):
-    return t.t().contiguous().t()
-
-
-def create_ar_from_tensor(t: torch.Tensor):
-    t = convert_to_column_major(t)
-    if t.device.type == "cuda":
-        ar = cp.asarray(t)
-    else:
-        ar = t.numpy()
-    return ar
-
-
-def _create_edge_frame(src_t: torch.Tensor, dst_t: torch.Tensor, single_gpu: bool):
-    """
-    Create a edge dataframe from src_t and dst_t
-    """
-    src_ar = create_ar_from_tensor(src_t)
-    dst_ar = create_ar_from_tensor(dst_t)
-    edge_df = _create_df_from_edge_ar(src_ar, dst_ar, single_gpu=single_gpu)
-    edge_df = edge_df.rename(
-        columns={edge_df.columns[0]: src_n, edge_df.columns[1]: dst_n}
-    )
-    return edge_df
-
-
-def _create_df_from_edge_ar(src_ar, dst_ar, single_gpu=True):
-    if not single_gpu:
-        nworkers = len(get_client().scheduler_info()["workers"])
-        npartitions = nworkers * 1
-    if single_gpu:
-        df = cudf.DataFrame(data={src_n: src_ar, dst_n: dst_ar})
-    else:
-        if isinstance(src_ar, cp.ndarray):
-            src_ar = src_ar.get()
-        if isinstance(dst_ar, cp.ndarray):
-            dst_ar = dst_ar.get()
-
-        df = pd.DataFrame(data={src_n: src_ar, dst_n: dst_ar})
-        # Only save stuff in host memory
-        df = dd.from_pandas(df, npartitions=npartitions).persist()
-        df = df.map_partitions(cudf.DataFrame.from_pandas)
-
-    df = df.reset_index(drop=True)
-    return df
-
-
-def get_edges_dict_from_dgl_HeteroGraph(
-    graph: dgl.DGLHeteroGraph, single_gpu: bool
-) -> Dict[Tuple[str, str, str], Union[cudf.DataFrame, dask_cudf.DataFrame]]:
-    etype_d = {}
-    for can_etype in graph.canonical_etypes:
-        src_t, dst_t = graph.edges(form="uv", etype=can_etype)
-        etype_d[can_etype] = _create_edge_frame(src_t, dst_t, single_gpu)
-    return etype_d
-
-
-def add_ndata_from_dgl_HeteroGraph(gs, g):
-    for feat_name, feat in g.ndata.items():
-        if isinstance(feat, torch.Tensor):
-            assert len(g.ntypes) == 1
-            ntype = g.ntypes[0]
-            gs.ndata_storage.add_data(
-                feat_name=feat_name, type_name=ntype, feat_obj=feat
-            )
-        else:
-            for ntype, feat_t in feat.items():
-                gs.ndata_storage.add_data(
-                    feat_name=feat_name, type_name=ntype, feat_obj=feat_t
-                )
-
-
-def add_edata_from_dgl_HeteroGraph(gs, g):
-    for feat_name, feat in g.edata.items():
-        if isinstance(feat, torch.Tensor):
-            assert len(g.etypes) == 1
-            etype = g.etypes[0]
-            gs.edata_storage.add_data(
-                feat_name=feat_name, type_name=etype, feat_obj=feat
-            )
-        else:
-            for etype, feat_t in feat.items():
-                gs.edata_storage.add_data(
-                    feat_name=feat_name, type_name=etype, feat_obj=feat_t
-                )
-
-
-def _cast_to_torch_tensor(t: TensorType) -> "torch.Tensor":
-    if isinstance(t, torch.Tensor):
-        return t
-    elif isinstance(t, (cp.ndarray, cudf.Series)):
-        return torch.as_tensor(t, device="cuda")
-    elif isinstance(t, (pd.Series, np.ndarray)):
-        return torch.as_tensor(t, device="cpu")
-    return torch.as_tensor(t)
diff --git a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_storage_utils.py b/python/cugraph-dgl/cugraph_dgl/utils/cugraph_storage_utils.py
deleted file mode 100644
index cc23aa910a5..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_storage_utils.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-from cugraph.gnn.dgl_extensions.utils.sampling import eid_n, src_n, dst_n
-from cugraph.utilities.utils import import_optional, MissingModule
-
-dgl = import_optional("dgl")
-F = import_optional("dgl.backend")
-
-
-def _assert_valid_canonical_etype(canonical_etype):
-    if not _is_valid_canonical_etype:
-        error_message = (
-            f"Invalid canonical_etype {canonical_etype} "
-            + "canonical etype should be is a string triplet (str, str, str)"
-            + "for source node type, edge type and destination node type"
-        )
-        raise dgl.DGLError(error_message)
-
-
-def _is_valid_canonical_etype(canonical_etype):
-    if not isinstance(canonical_etype, tuple):
-        return False
-
-    if len(canonical_etype) != 3:
-        return False
-
-    for t in canonical_etype:
-        if not isinstance(t, str):
-            return False
-    return True
-
-
-def add_edge_ids_to_edges_dict(edge_data_dict, edge_id_offset_d, id_dtype):
-    eids_data_dict = {}
-    for etype, df in edge_data_dict.items():
-        # Do not modify input by user
-        if len(df.columns) != 2:
-            raise ValueError(
-                "Provided dataframe in edge_dict contains more than 2 columns",
-                "DataFrame with only 2 columns is supported",
-                "Where first is treated as src and second as dst",
-            )
-        df = df.copy(deep=False)
-        df = df.rename(columns={df.columns[0]: src_n, df.columns[1]: dst_n})
-        df[eid_n] = id_dtype(1)
-        df[eid_n] = df[eid_n].cumsum()
-        df[eid_n] = df[eid_n] + edge_id_offset_d[etype] - 1
-        df[eid_n] = df[eid_n].astype(id_dtype)
-        eids_data_dict[etype] = df
-    return eids_data_dict
-
-
-def add_node_offset_to_edges_dict(edge_data_dict, node_id_offset_d):
-    for etype, df in edge_data_dict.items():
-        src_type, _, dst_type = etype
-        df[src_n] = df[src_n] + node_id_offset_d[src_type]
-        df[dst_n] = df[dst_n] + node_id_offset_d[dst_type]
-    return edge_data_dict
-
-
-if isinstance(F, MissingModule):
-    backend_dtype_to_np_dtype_dict = MissingModule("dgl")
-else:
-    backend_dtype_to_np_dtype_dict = {
-        F.bool: bool,
-        F.uint8: np.uint8,
-        F.int8: np.int8,
-        F.int16: np.int16,
-        F.int32: np.int32,
-        F.int64: np.int64,
-        F.float16: np.float16,
-        F.float32: np.float32,
-        F.float64: np.float64,
-    }
diff --git a/python/cugraph-dgl/cugraph_dgl/utils/feature_storage.py b/python/cugraph-dgl/cugraph_dgl/utils/feature_storage.py
deleted file mode 100644
index 31917661557..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/utils/feature_storage.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-from cugraph.gnn import FeatureStore
-from cugraph.utilities.utils import import_optional
-
-torch = import_optional("torch")
-
-
-class dgl_FeatureStorage:
-    """
-    Storage for node/edge feature data.
-    """
-
-    def __init__(self, fs: FeatureStore, type_name: str, feat_name: str):
-        self.fs = fs
-        self.type_name = type_name
-        self.feat_name = feat_name
-
-    def fetch(self, indices, device=None, pin_memory=False, **kwargs):
-        """Fetch the features of the given node/edge IDs to the
-        given device.
-        Parameters
-        ----------
-        indices : Tensor
-            Node or edge IDs.
-        device : Device
-            Device context.
-        pin_memory : bool
-            Wether to use pin_memory for fetching features
-            pin_memory=True is currently not supported
-
-        Returns
-        -------
-        Tensor
-            Feature data stored in PyTorch Tensor.
-        """
-        if pin_memory:
-            raise ValueError("pinned memory not supported in dgl_FeatureStorage")
-        if isinstance(indices, torch.Tensor):
-            indices = indices.long()
-        t = self.fs.get_data(
-            indices=indices, type_name=self.type_name, feat_name=self.feat_name
-        )
-        if device:
-            return t.to(device)
-        else:
-            return t
diff --git a/python/cugraph-dgl/cugraph_dgl/view.py b/python/cugraph-dgl/cugraph_dgl/view.py
deleted file mode 100644
index 4de9406be07..00000000000
--- a/python/cugraph-dgl/cugraph_dgl/view.py
+++ /dev/null
@@ -1,346 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import warnings
-
-from collections import defaultdict
-from collections.abc import MutableMapping
-from typing import Union, Dict, List, Tuple
-
-from cugraph.utilities.utils import import_optional
-
-import cugraph_dgl
-from cugraph_dgl.typing import TensorType
-from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor
-
-torch = import_optional("torch")
-dgl = import_optional("dgl")
-
-
-class EmbeddingView:
-    def __init__(self, storage: "dgl.storages.base.FeatureStorage", ld: int):
-        self.__ld = ld
-        self.__storage = storage
-
-    def __getitem__(self, u: TensorType) -> "torch.Tensor":
-        u = _cast_to_torch_tensor(u)
-        try:
-            return self.__storage.fetch(
-                u,
-                "cuda",
-            )
-        except RuntimeError as ex:
-            warnings.warn(
-                "Got error accessing data, trying again with index on device: "
-                + str(ex)
-            )
-            return self.__storage.fetch(
-                u.cuda(),
-                "cuda",
-            )
-
-    @property
-    def shape(self) -> "torch.Size":
-        try:
-            f = self.__storage.fetch(torch.tensor([0]), "cpu")
-        except RuntimeError:
-            f = self.__storage.fetch(torch.tensor([0], device="cuda"), "cuda")
-        sz = [s for s in f.shape]
-        sz[0] = self.__ld
-        return torch.Size(tuple(sz))
-
-
-class HeteroEdgeDataView(MutableMapping):
-    """
-    Duck-typed version of DGL's HeteroEdgeDataView.
-    Used for accessing and modifying edge features.
-    """
-
-    def __init__(
-        self,
-        graph: "cugraph_dgl.Graph",
-        etype: Union[Tuple[str, str, str], List[Tuple[str, str, str]]],
-        edges: TensorType,
-    ):
-        self.__graph = graph
-        self.__etype = etype
-        self.__edges = edges
-
-    @property
-    def _etype(self) -> Tuple[str, str, str]:
-        return self.__etype
-
-    @property
-    def _graph(self) -> "cugraph_dgl.Graph":
-        return self.__graph
-
-    @property
-    def _edges(self) -> TensorType:
-        return self.__edges
-
-    def __getitem__(self, key: str):
-        if isinstance(self._etype, list):
-            return {
-                t: self._graph._get_e_emb(t, key, self._edges)
-                for t in self._etype
-                if self._graph._has_e_emb(t, key)
-            }
-
-        return self._graph._get_e_emb(self._etype, key, self._edges)
-
-    def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]):
-        if isinstance(self._etype, list):
-            if not isinstance(val, dict):
-                raise ValueError(
-                    "There are multiple edge types in this view. "
-                    "Expected a dictionary of values."
-                )
-            for t, v in val.items():
-                if t not in self._etype:
-                    raise ValueError("Attempted to modify a type out of view.")
-                self._graph.set_e_emb(t, self._edges, {key: v})
-        else:
-            if isinstance(val, dict):
-                raise ValueError(
-                    "There is only one edge type in this view. "
-                    "Expected a single tensor."
-                )
-            self._graph.set_e_emb(self._etype, self._edges, {key: v})
-
-    def __delitem__(self, key: str):
-        if isinstance(self._etype, list):
-            for t in self._etype:
-                self._graph.pop_e_emb(t, key)
-        else:
-            self._graph.pop_e_emb(self._etype, key)
-
-    def _transpose(self, fetch_vals=True):
-        if isinstance(self._etype, list):
-            tr = defaultdict(dict)
-            for etype in self._etype:
-                for key in self._graph._get_e_emb_keys(etype):
-                    tr[key][etype] = (
-                        self._graph._get_e_emb(etype, key, self._edges)
-                        if fetch_vals
-                        else []
-                    )
-        else:
-            tr = {}
-            for key in self._graph._get_e_emb_keys(self._etype):
-                tr[key] = (
-                    self._graph._get_e_emb(self._etype, key, self._edges)
-                    if fetch_vals
-                    else []
-                )
-
-        return tr
-
-    def __len__(self):
-        return len(self._transpose(fetch_vals=False))
-
-    def __iter__(self):
-        return iter(self._transpose())
-
-    def keys(self):
-        return self._transpose(fetch_vals=False).keys()
-
-    def values(self):
-        return self._transpose().values()
-
-    def __repr__(self):
-        return repr(self._transpose(fetch_vals=False))
-
-
-class HeteroNodeDataView(MutableMapping):
-    """
-    Duck-typed version of DGL's HeteroNodeDataView.
-    Used for accessing and modifying node features.
-    """
-
-    def __init__(
-        self,
-        graph: "cugraph_dgl.Graph",
-        ntype: Union[str, List[str]],
-        nodes: TensorType,
-    ):
-        self.__graph = graph
-        self.__ntype = ntype
-        self.__nodes = nodes
-
-    @property
-    def _ntype(self) -> str:
-        return self.__ntype
-
-    @property
-    def _graph(self) -> "cugraph_dgl.Graph":
-        return self.__graph
-
-    @property
-    def _nodes(self) -> TensorType:
-        return self.__nodes
-
-    def __getitem__(self, key: str):
-        if isinstance(self._ntype, list):
-            return {
-                t: self._graph._get_n_emb(t, key, self._nodes)
-                for t in self._ntype
-                if self._graph._has_n_emb(t, key)
-            }
-        else:
-            return self._graph._get_n_emb(self._ntype, key, self._nodes)
-
-    def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]):
-        if isinstance(self._ntype, list):
-            if not isinstance(val, dict):
-                raise ValueError(
-                    "There are multiple node types in this view. "
-                    "Expected a dictionary of values."
-                )
-            for t, v in val.items():
-                if t not in self._ntype:
-                    raise ValueError("Attempted to modify a type out of view.")
-                self._graph._set_n_emb(t, self._nodes, {key: v})
-        else:
-            if isinstance(val, dict):
-                raise ValueError(
-                    "There is only one node type in this view. "
-                    "Expected a single value tensor."
-                )
-            self._graph._set_n_emb(self._ntype, self._nodes, {key: val})
-
-    def __delitem__(self, key: str):
-        if isinstance(self._ntype, list):
-            for t in self._ntype:
-                self._graph._pop_n_emb(t, key)
-        else:
-            self._graph.pop_n_emb(self._ntype, key)
-
-    def _transpose(self, fetch_vals=True):
-        if isinstance(self._ntype, list):
-            tr = defaultdict(dict)
-            for ntype in self._ntype:
-                for key in self._graph._get_n_emb_keys(ntype):
-                    tr[key][ntype] = (
-                        self._graph._get_n_emb(ntype, key, self._nodes)
-                        if fetch_vals
-                        else []
-                    )
-        else:
-            tr = {}
-            for key in self._graph._get_n_emb_keys(self._ntype):
-                tr[key] = (
-                    self._graph._get_n_emb(self._ntype, key, self._nodes)
-                    if fetch_vals
-                    else []
-                )
-
-        return tr
-
-    def __len__(self):
-        return len(self._transpose(fetch_vals=False))
-
-    def __iter__(self):
-        return iter(self._transpose())
-
-    def keys(self):
-        return self._transpose(fetch_vals=False).keys()
-
-    def values(self):
-        return self._transpose().values()
-
-    def __repr__(self):
-        return repr(self._transpose(fetch_vals=False))
-
-
-class HeteroEdgeView:
-    """
-    Duck-typed version of DGL's HeteroEdgeView.
-    """
-
-    def __init__(self, graph):
-        self.__graph = graph
-
-    @property
-    def _graph(self) -> "cugraph_dgl.Graph":
-        return self.__graph
-
-    def __getitem__(self, key):
-        if isinstance(key, slice):
-            if not (key.start is None and key.stop is None and key.stop is None):
-                raise ValueError("Only full slices are supported in DGL.")
-            edges = dgl.base.ALL
-            etype = None
-        elif key is None:
-            edges = dgl.base.ALL
-            etype = None
-        elif isinstance(key, tuple):
-            if len(key) == 3:
-                edges = dgl.base.ALL
-                etype = key
-            else:
-                edges = key
-                etype = None
-        elif isinstance(key, str):
-            edges = dgl.base.ALL
-            etype = key
-        else:
-            edges = key
-            etype = None
-
-        return HeteroEdgeDataView(
-            graph=self.__graph,
-            etype=etype,
-            edges=edges,
-        )
-
-    def __call__(self, *args, **kwargs):
-        if "device" in kwargs:
-            return self.__graph.all_edges(*args, **kwargs)
-
-        return self.__graph.all_edges(*args, **kwargs, device="cuda")
-
-
-class HeteroNodeView:
-    """
-    Duck-typed version of DGL's HeteroNodeView.
-    """
-
-    def __init__(self, graph: "cugraph_dgl.Graph"):
-        self.__graph = graph
-
-    @property
-    def _graph(self) -> "cugraph_dgl.Graph":
-        return self.__graph
-
-    def __getitem__(self, key):
-        if isinstance(key, slice):
-            if not (key.start is None and key.stop is None and key.stop is None):
-                raise ValueError("Only full slices are supported in DGL.")
-            nodes = dgl.base.ALL
-            ntype = None
-        elif isinstance(key, tuple):
-            nodes, ntype = key
-        elif key is None or isinstance(key, str):
-            nodes = dgl.base.ALL
-            ntype = key
-        else:
-            nodes = key
-            ntype = None
-
-        return HeteroNodeDataView(graph=self.__graph, ntype=ntype, nodes=nodes)
-
-    def __call__(self, ntype=None):
-        return torch.arange(
-            0, self.__graph.num_nodes(ntype), dtype=self.__graph.idtype, device="cuda"
-        )
diff --git a/python/cugraph-dgl/examples/dataset_from_disk_cudf.ipynb b/python/cugraph-dgl/examples/dataset_from_disk_cudf.ipynb
deleted file mode 100644
index 15708f5dea6..00000000000
--- a/python/cugraph-dgl/examples/dataset_from_disk_cudf.ipynb
+++ /dev/null
@@ -1,269 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "bc952178-34c0-4f13-9003-478d4aa8cd4d",
-   "metadata": {},
-   "source": [
-    "# Testing Notebook for cugraph DGL vs DGL Upstream"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "d92a81b3-50ac-42ff-97e0-d636945f1f80",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"4\"\n",
-    "import cudf\n",
-    "import rmm\n",
-    "import torch\n",
-    "from rmm.allocators.torch import rmm_torch_allocator\n",
-    "rmm.reinitialize(initial_pool_size=15e9)\n",
-    "#Switch to async pool in case of memory issues due to fragmentation of the pool\n",
-    "#rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource(initial_pool_size=15e9))\n",
-    "torch.cuda.memory.change_current_allocator(rmm_torch_allocator)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "f304a5dd-1465-4054-846f-2308a19153fa",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "single_gpu = True"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "b6f899ee",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "def load_dgl_dataset(dataset_name='ogbn-products'):\n",
-    "    from ogb.nodeproppred import DglNodePropPredDataset\n",
-    "    dataset_root = '/raid/vjawa/gnn/'\n",
-    "    dataset =  DglNodePropPredDataset(name = dataset_name, root=dataset_root)\n",
-    "    split_idx = dataset.get_idx_split()\n",
-    "    train_idx, valid_idx, test_idx = split_idx[\"train\"], split_idx[\"valid\"], split_idx[\"test\"]\n",
-    "    g, label = dataset[0]\n",
-    "    g.ndata['label'] = label\n",
-    "    g = g.add_self_loop()\n",
-    "    g = g.to('cpu')\n",
-    "    return g, train_idx"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "fdd59d3a-0c1d-425f-a337-34b09c675622",
-   "metadata": {},
-   "source": [
-    "# cuGraph DGL DataLoader"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "e1e84844-634e-451e-be74-939f9477562f",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import cugraph_dgl\n",
-    "import tempfile"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "3808055c-2d7d-4cc7-b1bd-2fe9edd6eb95",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!rm -rf \"/raid/vjawa/obgn_products_sampling/\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "eff3d77b",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "g, train_idx = load_dgl_dataset()\n",
-    "g = cugraph_dgl.cugraph_storage_from_heterograph(g, single_gpu=single_gpu)\n",
-    "\n",
-    "batch_size = 1024*2\n",
-    "fanout_vals=[25, 25]\n",
-    "sampler = cugraph_dgl.dataloading.NeighborSampler(fanout_vals)\n",
-    "dataloader = cugraph_dgl.dataloading.DataLoader(\n",
-    "    g,                               \n",
-    "    train_idx.to('cuda'),                        # train_nid must be on GPU.\n",
-    "    sampler,\n",
-    "    sampling_output_dir=\"/raid/vjawa/obgn_products_sampling/\", # Path to save sampling results to, Change to the fastest IO path available\n",
-    "    device=torch.device('cuda'),    # The device argument must be GPU.\n",
-    "    num_workers=0,                 # Number of workers must be 0.\n",
-    "    batch_size=batch_size,\n",
-    "    batches_per_partition=50,\n",
-    "    seeds_per_call=50*batch_size,\n",
-    "    drop_last=False,\n",
-    "    shuffle=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "94003c30-756f-4cdb-856a-dec16a5fb4dc",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "7.08 s ± 596 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%timeit\n",
-    "batch_stats = {}\n",
-    "for batch_id,(input_nodes, output_nodes, blocks) in enumerate(dataloader):\n",
-    "    batch_stats[batch_id]={'input_nodes':len(input_nodes),'output_nodes':len(output_nodes)}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "d8488e64-ba92-40c6-8e76-3898b1ca4317",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "del dataloader\n",
-    "del g"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b0a17523-53e9-4780-a9e1-eac4edd464e5",
-   "metadata": {},
-   "source": [
-    "# Pure DGL DataLoader"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "0d147756-6410-4b71-aac1-9ef1e3df8fff",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from dgl.dataloading import DataLoader, NeighborSampler\n",
-    "import dgl"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "7cb2cc68-b4ff-43f2-8b12-b2808510b3f2",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "g, train_idx = load_dgl_dataset()\n",
-    "batch_size = 1024*2\n",
-    "fanout_vals = [25, 25]\n",
-    "sampler = dgl.dataloading.MultiLayerNeighborSampler(fanout_vals)\n",
-    "dataloader = dgl.dataloading.DataLoader(\n",
-    "    g,                               \n",
-    "    train_idx.to(g.device),                        # train_nid must be on GPU.\n",
-    "    sampler,\n",
-    "    device=torch.device('cuda'),    # The device argument must be GPU.\n",
-    "    num_workers=0,                    # Number of workers must be 0.\n",
-    "    use_uva=False,\n",
-    "    batch_size=batch_size,\n",
-    "    drop_last=False,\n",
-    "    shuffle=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "7988aca2-7bfb-4200-ac87-008e30c670fb",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "7.34 s ± 353 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%timeit\n",
-    "dgl_batch_stats = {}\n",
-    "for batch_id,(input_nodes, output_nodes, blocks) in enumerate(dataloader):\n",
-    "    dgl_batch_stats[batch_id]={'input_nodes':len(input_nodes),'output_nodes':len(output_nodes)}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "57022ea6-d2fc-4334-a086-82201e8814c8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "del dataloader\n",
-    "del g"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.12"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "a1325b9b48ed9084674a30242e696fec2a1a44bbc4c0ef7ed1d4392854f3d402"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/python/cugraph-dgl/examples/graphsage/README.MD b/python/cugraph-dgl/examples/graphsage/README.MD
deleted file mode 100644
index ca867f0b634..00000000000
--- a/python/cugraph-dgl/examples/graphsage/README.MD
+++ /dev/null
@@ -1,26 +0,0 @@
-Inductive Representation Learning on Large Graphs (GraphSAGE)
-============
-
-- Paper link: [http://papers.nips.cc/paper/6703-inductive-representation-learning-on-large-graphs.pdf](http://papers.nips.cc/paper/6703-inductive-representation-learning-on-large-graphs.pdf)
-- Author's code repo: [https://github.com/williamleif/graphsage-simple](https://github.com/williamleif/graphsage-simple)
-
-For advanced usages, including training with multi-gpu/multi-node, and PyTorch Lightning, etc., more examples can be found in [advanced](https://github.com/dmlc/dgl/tree/master/examples/pytorch/graphsage/advanced) and [dist](https://github.com/dmlc/dgl/tree/master/examples/pytorch/graphsage/dist) directory.
-
-Requirements
-------------
-
-```bash
-mamba install ogb torchmetrics -c conda-forge
-```
-
-How to run
--------
-
-
-### Minibatch training for node classification
-
-Train w/ mini-batch sampling with cugraph_storage backend for node classification on "ogbn-products"
-
-```bash
-python3 node_classification.py --mode=gpu_cugraph_dgl
-```
diff --git a/python/cugraph-dgl/examples/graphsage/node-classification-dask.py b/python/cugraph-dgl/examples/graphsage/node-classification-dask.py
deleted file mode 100644
index 0481f9566bc..00000000000
--- a/python/cugraph-dgl/examples/graphsage/node-classification-dask.py
+++ /dev/null
@@ -1,272 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Example modified from:
-# https://github.com/dmlc/dgl/blob/master/examples/pytorch/graphsage/node_classification.py
-
-# Ignore Warning
-import warnings
-import time
-import cugraph_dgl
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torchmetrics.functional as MF
-import dgl
-import dgl.nn as dglnn
-from dgl.data import AsNodePredDataset
-from dgl.dataloading import (
-    DataLoader,
-    NeighborSampler,
-    MultiLayerFullNeighborSampler,
-)
-from ogb.nodeproppred import DglNodePropPredDataset
-import tqdm
-import argparse
-
-warnings.filterwarnings("ignore")
-
-
-def set_allocators():
-    import rmm
-    import cudf
-    import cupy
-    from rmm.allocators.torch import rmm_torch_allocator
-    from rmm.allocators.cupy import rmm_cupy_allocator
-
-    mr = rmm.mr.CudaAsyncMemoryResource()
-    rmm.mr.set_current_device_resource(mr)
-    torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
-    cupy.cuda.set_allocator(rmm_cupy_allocator)
-    cudf.set_option("spill", True)
-
-
-class SAGE(nn.Module):
-    def __init__(self, in_size, hid_size, out_size):
-        super().__init__()
-        self.layers = nn.ModuleList()
-        # three-layer GraphSAGE-mean
-        self.layers.append(dglnn.SAGEConv(in_size, hid_size, "mean"))
-        self.layers.append(dglnn.SAGEConv(hid_size, hid_size, "mean"))
-        self.layers.append(dglnn.SAGEConv(hid_size, out_size, "mean"))
-        self.dropout = nn.Dropout(0.5)
-        self.hid_size = hid_size
-        self.out_size = out_size
-
-    def forward(self, blocks, x):
-        h = x
-        for l_id, (layer, block) in enumerate(zip(self.layers, blocks)):
-            h = layer(block, h)
-            if l_id != len(self.layers) - 1:
-                h = F.relu(h)
-                h = self.dropout(h)
-        return h
-
-    def inference(self, g, device, batch_size):
-        """Conduct layer-wise inference to get all the node embeddings."""
-        all_node_ids = torch.arange(0, g.num_nodes()).to(device)
-        feat = g.get_node_storage(key="feat", ntype="_N").fetch(
-            all_node_ids, device=device
-        )
-
-        sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=["feat"])
-        dataloader = DataLoader(
-            g,
-            torch.arange(g.num_nodes()).to(g.device),
-            sampler,
-            device=device,
-            batch_size=batch_size,
-            shuffle=False,
-            drop_last=False,
-            num_workers=0,
-        )
-        buffer_device = torch.device("cpu")
-        pin_memory = buffer_device != device
-
-        for l_id, layer in enumerate(self.layers):
-            y = torch.empty(
-                g.num_nodes(),
-                self.hid_size if l_id != len(self.layers) - 1 else self.out_size,
-                device=buffer_device,
-                pin_memory=pin_memory,
-            )
-            feat = feat.to(device)
-            for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
-                x = feat[input_nodes]
-                h = layer(blocks[0], x)  # len(blocks) = 1
-                if l_id != len(self.layers) - 1:
-                    h = F.relu(h)
-                    h = self.dropout(h)
-                # by design, our output nodes are contiguous
-                y[output_nodes[0] : output_nodes[-1] + 1] = h.to(buffer_device)
-            feat = y
-        return y
-
-
-def evaluate(model, graph, dataloader):
-    model.eval()
-    ys = []
-    y_hats = []
-    for it, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
-        with torch.no_grad():
-            if isinstance(graph.ndata["feat"], dict):
-                x = graph.ndata["feat"]["_N"][input_nodes]
-                label = graph.ndata["label"]["_N"][output_nodes]
-            else:
-                x = graph.ndata["feat"][input_nodes]
-                label = graph.ndata["label"][output_nodes]
-            ys.append(label)
-            y_hats.append(model(blocks, x))
-    num_classes = y_hats[0].shape[1]
-    return MF.accuracy(
-        torch.cat(y_hats),
-        torch.cat(ys),
-        task="multiclass",
-        num_classes=num_classes,
-    )
-
-
-def layerwise_infer(device, graph, nid, model, batch_size):
-    model.eval()
-    with torch.no_grad():
-        pred = model.inference(graph, device, batch_size)  # pred in buffer_device
-        pred = pred[nid]
-        label = graph.ndata["label"]
-        if isinstance(label, dict):
-            label = label["_N"]
-        label = label[nid].to(device).to(pred.device)
-        num_classes = pred.shape[1]
-        return MF.accuracy(pred, label, task="multiclass", num_classes=num_classes)
-
-
-def train(args, device, g, dataset, model):
-    # create sampler & dataloader
-    train_idx = dataset.train_idx.to(device)
-    val_idx = dataset.val_idx.to(device)
-
-    use_uva = args.mode == "mixed"
-    batch_size = 1024
-    fanouts = [5, 10, 15]
-    sampler = NeighborSampler(fanouts)
-    train_dataloader = DataLoader(
-        g,
-        train_idx,
-        sampler,
-        device=device,
-        batch_size=batch_size,
-        shuffle=True,
-        drop_last=False,
-        num_workers=0,
-        use_uva=use_uva,
-    )
-    val_dataloader = DataLoader(
-        g,
-        val_idx,
-        sampler,
-        device=device,
-        batch_size=batch_size,
-        shuffle=True,
-        drop_last=False,
-        num_workers=0,
-        use_uva=use_uva,
-    )
-
-    opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
-
-    for epoch in range(10):
-        model.train()
-        total_loss = 0
-        st = time.time()
-        for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
-            if isinstance(g.ndata["feat"], dict):
-                x = g.ndata["feat"]["_N"][input_nodes]
-                y = g.ndata["label"]["_N"][output_nodes]
-            else:
-                x = g.ndata["feat"][input_nodes]
-                y = g.ndata["label"][output_nodes]
-
-            y_hat = model(blocks, x)
-            loss = F.cross_entropy(y_hat, y)
-            opt.zero_grad()
-            loss.backward()
-            opt.step()
-            total_loss += loss.item()
-
-        et = time.time()
-
-        print(
-            f"Time taken for epoch {epoch} with batch_size {batch_size} = {et - st} s"
-        )
-        acc = evaluate(model, g, val_dataloader)
-        print(
-            "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format(
-                epoch, total_loss / (it + 1), acc.item()
-            )
-        )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--mode",
-        default="gpu_cugraph_dgl",
-        choices=["cpu", "mixed", "gpu_dgl", "gpu_cugraph_dgl"],
-        help="Training mode."
-        " 'cpu' for CPU training,"
-        " 'mixed' for CPU-GPU mixed training, "
-        " 'gpu_dgl' for pure-GPU training, "
-        " 'gpu_cugraph_dgl' for pure-GPU training.",
-    )
-    args = parser.parse_args()
-    if not torch.cuda.is_available():
-        args.mode = "cpu"
-    if args.mode == "gpu_cugraph_dgl":
-        set_allocators()
-    print(f"Training in {args.mode} mode.")
-
-    # load and preprocess dataset
-    print("Loading data")
-    dataset = AsNodePredDataset(DglNodePropPredDataset("ogbn-products"))
-    g = dataset[0]
-    g = dgl.add_self_loop(g)
-    if args.mode == "gpu_cugraph_dgl":
-        g = cugraph_dgl.cugraph_storage_from_heterograph(g.to("cuda"))
-        del dataset.g
-
-    else:
-        g = g.to("cuda" if args.mode == "gpu_dgl" else "cpu")
-    device = torch.device(
-        "cpu" if args.mode == "cpu" or args.mode == "mixed" else "cuda"
-    )
-
-    # create GraphSAGE model
-    feat_shape = (
-        g.get_node_storage(key="feat", ntype="_N")
-        .fetch(torch.LongTensor([0]).to(device), device=device)
-        .shape[1]
-    )
-    print(feat_shape)
-    # no ndata in cugraph storage object
-    in_size = feat_shape
-    out_size = dataset.num_classes
-    model = SAGE(in_size, 256, out_size).to(device)
-
-    # model training
-    print("Training...")
-    train(args, device, g, dataset, model)
-
-    # test the model
-    print("Testing...")
-    acc = layerwise_infer(device, g, dataset.test_idx, model, batch_size=4096)
-    print("Test Accuracy {:.4f}".format(acc.item()))
diff --git a/python/cugraph-dgl/examples/graphsage/node-classification.py b/python/cugraph-dgl/examples/graphsage/node-classification.py
deleted file mode 100644
index 56ac41c09b4..00000000000
--- a/python/cugraph-dgl/examples/graphsage/node-classification.py
+++ /dev/null
@@ -1,283 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Example modified from:
-# https://github.com/dmlc/dgl/blob/master/examples/pytorch/graphsage/node_classification.py
-
-# Ignore Warning
-import warnings
-import tempfile
-import time
-import cugraph_dgl
-import cugraph_dgl.dataloading
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torchmetrics.functional as MF
-import dgl
-import dgl.nn as dglnn
-from dgl.data import AsNodePredDataset
-from dgl.dataloading import (
-    DataLoader,
-    NeighborSampler,
-    MultiLayerFullNeighborSampler,
-)
-from ogb.nodeproppred import DglNodePropPredDataset
-import tqdm
-import argparse
-
-warnings.filterwarnings("ignore")
-
-
-def set_allocators():
-    import rmm
-    import cudf
-    import cupy
-    from rmm.allocators.torch import rmm_torch_allocator
-    from rmm.allocators.cupy import rmm_cupy_allocator
-
-    mr = rmm.mr.CudaAsyncMemoryResource()
-    rmm.mr.set_current_device_resource(mr)
-    torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
-    cupy.cuda.set_allocator(rmm_cupy_allocator)
-    cudf.set_option("spill", True)
-
-
-class SAGE(nn.Module):
-    def __init__(self, in_size, hid_size, out_size):
-        super().__init__()
-        self.layers = nn.ModuleList()
-        # three-layer GraphSAGE-mean
-        self.layers.append(dglnn.SAGEConv(in_size, hid_size, "mean"))
-        self.layers.append(dglnn.SAGEConv(hid_size, hid_size, "mean"))
-        self.layers.append(dglnn.SAGEConv(hid_size, out_size, "mean"))
-        self.dropout = nn.Dropout(0.5)
-        self.hid_size = hid_size
-        self.out_size = out_size
-
-    def forward(self, blocks, x):
-        h = x
-        for l_id, (layer, block) in enumerate(zip(self.layers, blocks)):
-            h = layer(block, h)
-            if l_id != len(self.layers) - 1:
-                h = F.relu(h)
-                h = self.dropout(h)
-        return h
-
-    def inference(self, g, device, batch_size):
-        """Conduct layer-wise inference to get all the node embeddings."""
-        all_node_ids = torch.arange(0, g.num_nodes()).to(device)
-        feat = g.ndata["feat"][all_node_ids].to(device)
-
-        if isinstance(g, cugraph_dgl.Graph):
-            sampler = cugraph_dgl.dataloading.NeighborSampler([-1])
-            loader_cls = cugraph_dgl.dataloading.FutureDataLoader
-        else:
-            sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=["feat"])
-            loader_cls = DataLoader
-        dataloader = loader_cls(
-            g,
-            torch.arange(g.num_nodes()).to(device),
-            sampler,
-            device=device,
-            batch_size=batch_size,
-            shuffle=False,
-            drop_last=False,
-            num_workers=0,
-        )
-        buffer_device = torch.device("cpu")
-        pin_memory = buffer_device != device
-
-        for l_id, layer in enumerate(self.layers):
-            y = torch.empty(
-                g.num_nodes(),
-                self.hid_size if l_id != len(self.layers) - 1 else self.out_size,
-                device=buffer_device,
-                pin_memory=pin_memory,
-            )
-            feat = feat.to(device)
-            for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
-                x = feat[input_nodes]
-                h = layer(blocks[0], x)  # len(blocks) = 1
-                if l_id != len(self.layers) - 1:
-                    h = F.relu(h)
-                    h = self.dropout(h)
-                # by design, our output nodes are contiguous
-                y[output_nodes[0] : output_nodes[-1] + 1] = h.to(buffer_device)
-            feat = y
-        return y
-
-
-def evaluate(model, graph, dataloader):
-    model.eval()
-    ys = []
-    y_hats = []
-    for it, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
-        with torch.no_grad():
-            if isinstance(graph.ndata["feat"], dict):
-                x = graph.ndata["feat"]["_N"][input_nodes]
-                label = graph.ndata["label"]["_N"][output_nodes]
-            else:
-                x = graph.ndata["feat"][input_nodes]
-                label = graph.ndata["label"][output_nodes]
-            ys.append(label)
-            y_hats.append(model(blocks, x))
-    num_classes = y_hats[0].shape[1]
-    return MF.accuracy(
-        torch.cat(y_hats),
-        torch.cat(ys),
-        task="multiclass",
-        num_classes=num_classes,
-    )
-
-
-def layerwise_infer(device, graph, nid, model, batch_size):
-    model.eval()
-    with torch.no_grad():
-        pred = model.inference(graph, device, batch_size)  # pred in buffer_device
-        pred = pred[nid]
-        label = graph.ndata["label"]
-        if isinstance(label, dict):
-            label = label["_N"]
-        label = label[nid].to(device).to(pred.device)
-        num_classes = pred.shape[1]
-        return MF.accuracy(pred, label, task="multiclass", num_classes=num_classes)
-
-
-def train(args, device, g, dataset, model, directory):
-    # create sampler & dataloader
-    train_idx = dataset.train_idx.to(device)
-    val_idx = dataset.val_idx.to(device)
-
-    use_uva = args.mode == "mixed"
-    batch_size = 1024
-    fanouts = [5, 10, 15]
-    if isinstance(g, cugraph_dgl.Graph):
-        sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts, directory=directory)
-        loader_cls = cugraph_dgl.dataloading.FutureDataLoader
-    else:
-        sampler = NeighborSampler(fanouts)
-        loader_cls = DataLoader
-    train_dataloader = loader_cls(
-        g,
-        train_idx,
-        sampler,
-        device=device,
-        batch_size=batch_size,
-        shuffle=True,
-        drop_last=False,
-        num_workers=0,
-        use_uva=use_uva,
-    )
-    val_dataloader = loader_cls(
-        g,
-        val_idx,
-        sampler,
-        device=device,
-        batch_size=batch_size,
-        shuffle=True,
-        drop_last=False,
-        num_workers=0,
-        use_uva=use_uva,
-    )
-
-    opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
-
-    for epoch in range(10):
-        model.train()
-        total_loss = 0
-        st = time.time()
-        for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
-            if isinstance(g.ndata["feat"], dict):
-                x = g.ndata["feat"]["_N"][input_nodes]
-                y = g.ndata["label"]["_N"][output_nodes]
-            else:
-                x = g.ndata["feat"][input_nodes]
-                y = g.ndata["label"][output_nodes]
-
-            y_hat = model(blocks, x)
-            loss = F.cross_entropy(y_hat, y)
-            opt.zero_grad()
-            loss.backward()
-            opt.step()
-            total_loss += loss.item()
-
-        et = time.time()
-
-        print(
-            f"Time taken for epoch {epoch} with batch_size {batch_size} = {et - st} s"
-        )
-        acc = evaluate(model, g, val_dataloader)
-        print(
-            "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format(
-                epoch, total_loss / (it + 1), acc.item()
-            )
-        )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--mode",
-        default="gpu_cugraph_dgl",
-        choices=["cpu", "mixed", "gpu_dgl", "gpu_cugraph_dgl"],
-        help="Training mode."
-        " 'cpu' for CPU training,"
-        " 'mixed' for CPU-GPU mixed training, "
-        " 'gpu_dgl' for pure-GPU training, "
-        " 'gpu_cugraph_dgl' for pure-GPU training.",
-    )
-    parser.add_argument("--dataset_root", type=str, default="dataset")
-    parser.add_argument("--tempdir_root", type=str, default=None)
-    args = parser.parse_args()
-    if not torch.cuda.is_available():
-        args.mode = "cpu"
-    if args.mode == "gpu_cugraph_dgl":
-        set_allocators()
-    print(f"Training in {args.mode} mode.")
-
-    # load and preprocess dataset
-    print("Loading data")
-    dataset = AsNodePredDataset(
-        DglNodePropPredDataset("ogbn-products", root=args.dataset_root)
-    )
-    g = dataset[0]
-    g = dgl.add_self_loop(g)
-    if args.mode == "gpu_cugraph_dgl":
-        g = cugraph_dgl.cugraph_dgl_graph_from_heterograph(g.to("cuda"))
-        del dataset.g
-
-    else:
-        g = g.to("cuda" if args.mode == "gpu_dgl" else "cpu")
-    device = torch.device(
-        "cpu" if args.mode == "cpu" or args.mode == "mixed" else "cuda"
-    )
-
-    # create GraphSAGE model
-    feat_shape = g.ndata["feat"].shape[1]
-    print(feat_shape)
-
-    in_size = feat_shape
-    out_size = dataset.num_classes
-    model = SAGE(in_size, 256, out_size).to(device)
-
-    # model training
-    print("Training...")
-    with tempfile.TemporaryDirectory(dir=args.tempdir_root) as directory:
-        train(args, device, g, dataset, model, directory)
-
-    # test the model
-    print("Testing...")
-    acc = layerwise_infer(device, g, dataset.test_idx, model, batch_size=4096)
-    print("Test Accuracy {:.4f}".format(acc.item()))
diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py
deleted file mode 100644
index 3e0c0454905..00000000000
--- a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# A graphsage GNN model using dgl for node classification
-# with three layers and mean aggregation
-import time
-import dgl
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torchmetrics.functional as MF
-from cugraph_dgl.nn import SAGEConv
-import tqdm
-
-
-class Sage(nn.Module):
-    def __init__(self, in_size, hid_size, out_size):
-        super().__init__()
-        self.layers = nn.ModuleList()
-        # 2-layer GraphSAGE-mean
-        self.layers.append(SAGEConv(in_size, hid_size, "mean"))
-        self.layers.append(SAGEConv(hid_size, out_size, "mean"))
-        self.dropout = nn.Dropout(0.5)
-        self.hid_size = hid_size
-        self.out_size = out_size
-
-    def forward(self, blocks, x):
-        h = x
-        for l_id, (layer, block) in enumerate(zip(self.layers, blocks)):
-            h = layer(block, h)
-            if l_id != len(self.layers) - 1:
-                h = F.relu(h)
-                h = self.dropout(h)
-        return h
-
-    def inference(self, g, batch_size, device):
-        """
-        Inference with the GraphSAGE model on
-        full neighbors (i.e. without neighbor sampling).
-        g : the entire graph.
-        batch_size : the node number of each inference output
-        device : the inference device
-        """
-        # During inference with sampling,
-        # multi-layer blocks are very inefficient because
-        # lots of computations in the first few layers are repeated.
-        # Therefore, we compute the representation of all nodes layer by layer.
-        # The nodes on each layer are of course splitted in batches.
-
-        all_node_ids = torch.arange(0, g.num_nodes()).to(device)
-        feat = g.ndata["feat"][all_node_ids].to(device)
-
-        sampler = dgl.dataloading.MultiLayerFullNeighborSampler(
-            1, prefetch_node_feats=["feat"]
-        )
-        dataloader = dgl.dataloading.DataLoader(
-            g,
-            torch.arange(g.num_nodes(), dtype=torch.int32).to(g.device),
-            sampler,
-            device=device,
-            batch_size=batch_size,
-            shuffle=False,
-            drop_last=False,
-            num_workers=0,
-        )
-        buffer_device = torch.device("cpu")
-        pin_memory = buffer_device != device
-
-        for l_id, layer in enumerate(self.layers):
-            y = torch.empty(
-                g.num_nodes(),
-                self.hid_size if l_id != len(self.layers) - 1 else self.out_size,
-                device=buffer_device,
-                pin_memory=pin_memory,
-            )
-            feat = feat.to(device)
-            for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
-                x = feat[input_nodes]
-                h = layer(blocks[0], x)  # len(blocks) = 1
-                if l_id != len(self.layers) - 1:
-                    h = F.relu(h)
-                    h = self.dropout(h)
-                # by design, our output nodes are contiguous
-                y[output_nodes[0] : output_nodes[-1] + 1] = h.to(buffer_device)
-            feat = y
-        return y
-
-
-def layerwise_infer(graph, nid, model, batch_size, device):
-    model.eval()
-    with torch.no_grad():
-        pred = model.module.inference(
-            graph, batch_size, device
-        )  # pred in buffer_device
-        pred = pred[nid]
-        label = graph.ndata["label"]
-        if isinstance(label, dict):
-            label = label["_N"]
-        label = label[nid].to(pred.device)
-        num_classes = pred.shape[1]
-        label = label.squeeze(1)
-        return MF.accuracy(pred, label, task="multiclass", num_classes=num_classes)
-
-
-def train_model(model, g, opt, train_dataloader, num_epochs, rank, val_nid):
-    st = time.time()
-    model.train()
-    for epoch in range(num_epochs):
-        total_loss = 0
-        for _, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
-            x = g.ndata["feat"][input_nodes].to(torch.float32)
-            y = g.ndata["label"][output_nodes].to(torch.int64)
-            y_hat = model(blocks, x)
-            y = y.squeeze(1)
-            loss = F.cross_entropy(y_hat, y)
-            opt.zero_grad()
-            loss.backward()
-            opt.step()
-            total_loss += loss.item()
-        print(
-            f"total loss: {total_loss} for epoch = {epoch} for rank = {rank}",
-            flush=True,
-        )
-    et = time.time()
-    print(
-        f"Total time taken for num_epochs {num_epochs} "
-        f"with batch_size {train_dataloader._batch_size} = {et - st} s on rank ={rank}"
-    )
-    if rank == 0:
-        val_acc = layerwise_infer(g, val_nid, model, 1024 * 5, "cuda")
-        print("---" * 30)
-        print("Validation Accuracy {:.4f}".format(val_acc))
diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py
deleted file mode 100644
index 11afe466014..00000000000
--- a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py
+++ /dev/null
@@ -1,311 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import dgl
-import torch
-import time
-import tempfile
-import argparse
-import json
-import os
-import warnings
-
-from datetime import timedelta
-
-import cugraph_dgl
-
-from cugraph.gnn import (
-    cugraph_comms_init,
-    cugraph_comms_shutdown,
-    cugraph_comms_create_unique_id,
-)
-
-from pylibwholegraph.torch.initialize import (
-    init as wm_init,
-    finalize as wm_finalize,
-)
-
-# Allow computation on objects that are larger than GPU memory
-# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory
-os.environ["CUDF_SPILL"] = "1"
-
-
-def init_ddp_worker(global_rank, local_rank, world_size, cugraph_id):
-    import rmm
-
-    rmm.reinitialize(
-        devices=local_rank,
-        managed_memory=True,
-        pool_allocator=True,
-    )
-
-    import cupy
-
-    cupy.cuda.Device(local_rank).use()
-    from rmm.allocators.cupy import rmm_cupy_allocator
-
-    cupy.cuda.set_allocator(rmm_cupy_allocator)
-
-    from cugraph.testing.mg_utils import enable_spilling
-
-    enable_spilling()
-
-    torch.cuda.set_device(local_rank)
-
-    cugraph_comms_init(
-        rank=global_rank, world_size=world_size, uid=cugraph_id, device=local_rank
-    )
-
-    wm_init(global_rank, world_size, local_rank, torch.cuda.device_count())
-
-
-def load_dgl_dataset(dataset_root="dataset", dataset_name="ogbn-products"):
-    from ogb.nodeproppred import DglNodePropPredDataset
-
-    dataset = DglNodePropPredDataset(root=dataset_root, name=dataset_name)
-    split_idx = dataset.get_idx_split()
-    train_idx, valid_idx, test_idx = (
-        split_idx["train"],
-        split_idx["valid"],
-        split_idx["test"],
-    )
-    g, label = dataset[0]
-    g.ndata["label"] = label
-    if len(g.etypes) <= 1:
-        g = dgl.add_self_loop(g)
-    else:
-        for etype in g.etypes:
-            if etype[0] == etype[2]:
-                # only add self loops for src->dst
-                g = dgl.add_self_loop(g, etype=etype)
-
-    g = g.int()
-    idx = {
-        "train": train_idx.int(),
-        "valid": valid_idx.int(),
-        "test": test_idx.int(),
-    }
-
-    return g, idx, dataset.num_classes
-
-
-def partition_data(
-    g, split_idx, num_classes, edge_path, feature_path, label_path, meta_path
-):
-    # Split and save edge index
-    os.makedirs(
-        edge_path,
-        exist_ok=True,
-    )
-    src, dst = g.all_edges(form="uv", order="eid")
-    edge_index = torch.stack([src, dst])
-    for (r, e) in enumerate(torch.tensor_split(edge_index, world_size, dim=1)):
-        rank_path = os.path.join(edge_path, f"rank={r}.pt")
-        torch.save(
-            e.clone(),
-            rank_path,
-        )
-
-    # Split and save features
-    os.makedirs(
-        feature_path,
-        exist_ok=True,
-    )
-
-    nix = torch.arange(g.num_nodes())
-    for (r, f) in enumerate(torch.tensor_split(nix, world_size)):
-        feat_path = os.path.join(feature_path, f"rank={r}_feat.pt")
-        torch.save(g.ndata["feat"][f], feat_path)
-
-        label_f_path = os.path.join(feature_path, f"rank={r}_label.pt")
-        torch.save(g.ndata["label"][f], label_f_path)
-
-    # Split and save labels
-    os.makedirs(
-        label_path,
-        exist_ok=True,
-    )
-    for (d, i) in split_idx.items():
-        i_parts = torch.tensor_split(i, world_size)
-        for r, i_part in enumerate(i_parts):
-            rank_path = os.path.join(label_path, f"rank={r}")
-            os.makedirs(rank_path, exist_ok=True)
-            torch.save(i_part, os.path.join(rank_path, f"{d}.pt"))
-
-    # Save metadata
-    meta = {
-        "num_classes": int(num_classes),
-        "num_nodes": int(g.num_nodes()),
-    }
-    with open(meta_path, "w") as f:
-        json.dump(meta, f)
-
-
-def load_partitioned_data(rank, edge_path, feature_path, label_path, meta_path):
-    g = cugraph_dgl.Graph(
-        is_multi_gpu=True, ndata_storage="wholegraph", edata_storage="wholegraph"
-    )
-
-    # Load metadata
-    with open(meta_path, "r") as f:
-        meta = json.load(f)
-
-    # Load labels
-    split_idx = {}
-    for split in ["train", "test", "valid"]:
-        split_idx[split] = torch.load(
-            os.path.join(label_path, f"rank={rank}", f"{split}.pt")
-        )
-
-    # Load features
-    feat_t = torch.load(os.path.join(feature_path, f"rank={rank}_feat.pt"))
-    label_f_t = torch.load(os.path.join(feature_path, f"rank={rank}_label.pt"))
-    ndata = {"feat": feat_t, "label": label_f_t}
-    g.add_nodes(meta["num_nodes"], data=ndata)
-
-    # Load edge index
-    src, dst = torch.load(os.path.join(edge_path, f"rank={rank}.pt"))
-    g.add_edges(src.cuda(), dst.cuda(), data=None)
-
-    return g, split_idx, meta["num_classes"]
-
-
-def create_dataloader(gs, train_idx, device, temp_dir, stage):
-    import cugraph_dgl
-
-    temp_path = os.path.join(temp_dir, f"{stage}_{device}")
-    os.mkdir(temp_path)
-
-    sampler = cugraph_dgl.dataloading.NeighborSampler(
-        [10, 20],
-        directory=temp_path,
-        batches_per_partition=10,
-    )
-
-    dataloader = cugraph_dgl.dataloading.FutureDataLoader(
-        gs,
-        train_idx,
-        sampler,
-        device=device,  # Put the sampled MFGs on CPU or GPU
-        use_ddp=True,  # Make it work with distributed data parallel
-        batch_size=1024,
-        shuffle=False,  # Whether to shuffle the nodes for every epoch
-        drop_last=False,
-        num_workers=0,
-    )
-    return dataloader
-
-
-def run_workflow(
-    global_rank, local_rank, world_size, g, split_idx, num_classes, temp_dir
-):
-    from model import Sage, train_model
-
-    # Below sets gpu_number
-    dev_id = local_rank
-    device = torch.device(f"cuda:{dev_id}")
-
-    dataloader = create_dataloader(g, split_idx["train"], device, temp_dir, "train")
-    print("Dataloader Creation Complete", flush=True)
-    num_feats = g.ndata["feat"].shape[1]
-    hid_size = 256
-    # Load Training example
-    model = Sage(num_feats, hid_size, num_classes).to(device)
-    model = torch.nn.parallel.DistributedDataParallel(
-        model,
-        device_ids=[device],
-        output_device=device,
-    )
-    torch.distributed.barrier()
-    n_epochs = 10
-    total_st = time.time()
-    opt = torch.optim.Adam(model.parameters(), lr=0.01)
-    train_model(model, g, opt, dataloader, n_epochs, global_rank, split_idx["valid"])
-    torch.distributed.barrier()
-    total_et = time.time()
-    print(
-        f"Total time taken on n_epochs {n_epochs} = {total_et - total_st} s",
-        f"measured by worker = {global_rank}",
-    )
-
-    wm_finalize()
-    cugraph_comms_shutdown()
-
-
-if __name__ == "__main__":
-    if "LOCAL_RANK" in os.environ:
-        parser = argparse.ArgumentParser()
-        parser.add_argument("--dataset_root", type=str, default="dataset")
-        parser.add_argument("--tempdir_root", type=str, default=None)
-        parser.add_argument("--dataset", type=str, default="ogbn-products")
-        parser.add_argument("--skip_partition", action="store_true")
-        args = parser.parse_args()
-
-        torch.distributed.init_process_group(
-            "nccl",
-            timeout=timedelta(minutes=60),
-        )
-        world_size = torch.distributed.get_world_size()
-        global_rank = torch.distributed.get_rank()
-        local_rank = int(os.environ["LOCAL_RANK"])
-        device = torch.device(local_rank)
-
-        # Create the uid needed for cuGraph comms
-        if global_rank == 0:
-            cugraph_id = [cugraph_comms_create_unique_id()]
-        else:
-            cugraph_id = [None]
-        torch.distributed.broadcast_object_list(cugraph_id, src=0, device=device)
-        cugraph_id = cugraph_id[0]
-
-        init_ddp_worker(global_rank, local_rank, world_size, cugraph_id)
-
-        # Split the data
-        edge_path = os.path.join(args.dataset_root, args.dataset + "_eix_part")
-        feature_path = os.path.join(args.dataset_root, args.dataset + "_fea_part")
-        label_path = os.path.join(args.dataset_root, args.dataset + "_label_part")
-        meta_path = os.path.join(args.dataset_root, args.dataset + "_meta.json")
-
-        if not args.skip_partition and global_rank == 0:
-            partition_data(
-                *load_dgl_dataset(args.dataset_root, args.dataset),
-                edge_path,
-                feature_path,
-                label_path,
-                meta_path,
-            )
-        torch.distributed.barrier()
-
-        print("loading partitions...")
-        g, split_idx, num_classes = load_partitioned_data(
-            rank=global_rank,
-            edge_path=edge_path,
-            feature_path=feature_path,
-            label_path=label_path,
-            meta_path=meta_path,
-        )
-        print(f"rank {global_rank} has loaded its partition")
-        torch.distributed.barrier()
-
-        with tempfile.TemporaryDirectory(dir=args.tempdir_root) as directory:
-            run_workflow(
-                global_rank,
-                local_rank,
-                world_size,
-                g,
-                split_idx,
-                num_classes,
-                directory,
-            )
-    else:
-        warnings.warn("This script should be run with 'torchrun`.  Exiting.")
diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py
deleted file mode 100644
index 001d7fb82dc..00000000000
--- a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import dgl
-import torch
-import time
-import tempfile
-import argparse
-import os
-
-import cugraph_dgl
-
-from cugraph.gnn import (
-    cugraph_comms_init,
-    cugraph_comms_shutdown,
-    cugraph_comms_create_unique_id,
-)
-
-from pylibwholegraph.torch.initialize import (
-    init as wm_init,
-    finalize as wm_finalize,
-)
-
-# Allow computation on objects that are larger than GPU memory
-# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory
-os.environ["CUDF_SPILL"] = "1"
-
-
-def initalize_pytorch_worker(dev_id):
-    import cupy as cp
-    import rmm
-    from rmm.allocators.cupy import rmm_cupy_allocator
-
-    dev = cp.cuda.Device(
-        dev_id
-    )  # Create cuda context on the right gpu, defaults to gpu-0
-    dev.use()
-    rmm.reinitialize(
-        pool_allocator=True,
-        initial_pool_size=10e9,
-        maximum_pool_size=15e9,
-        devices=[dev_id],
-    )
-
-    from cugraph.testing.mg_utils import enable_spilling
-
-    enable_spilling()
-
-    torch.cuda.set_device(dev_id)
-    cp.cuda.set_allocator(rmm_cupy_allocator)
-    print("device_id", dev_id, flush=True)
-
-
-def load_dgl_dataset(
-    dataset_name="ogbn-products",
-    dataset_root=None,
-):
-    from ogb.nodeproppred import DglNodePropPredDataset
-
-    dataset = DglNodePropPredDataset(name=dataset_name, root=dataset_root)
-    split_idx = dataset.get_idx_split()
-    train_idx, valid_idx, test_idx = (
-        split_idx["train"],
-        split_idx["valid"],
-        split_idx["test"],
-    )
-    g, label = dataset[0]
-    g.ndata["label"] = label
-    if len(g.etypes) <= 1:
-        g = dgl.add_self_loop(g)
-    else:
-        for etype in g.etypes:
-            if etype[0] == etype[2]:
-                # only add self loops for src->dst
-                g = dgl.add_self_loop(g, etype=etype)
-
-    g = g.int()
-    train_idx = train_idx.int()
-    valid_idx = valid_idx.int()
-    test_idx = test_idx.int()
-    return g, train_idx, valid_idx, test_idx, dataset.num_classes
-
-
-def create_cugraph_graphstore_from_dgl_dataset(dataset, rank, world_size):
-    (g, train_idx, valid_idx, test_idx, num_classes) = dataset
-    # Partition the data
-    cg = cugraph_dgl.Graph(
-        is_multi_gpu=True, ndata_storage="wholegraph", edata_storage="wholegraph"
-    )
-
-    nix = torch.tensor_split(torch.arange(g.num_nodes()), world_size)[rank]
-    ndata = {k: g.ndata[k][nix].cuda() for k in g.ndata.keys()}
-
-    eix = torch.tensor_split(torch.arange(g.num_edges()), world_size)[rank]
-    src, dst = g.all_edges(form="uv", order="eid")
-    edata = {k: g.edata[k][eix].cuda() for k in g.edata.keys()}
-
-    cg.add_nodes(g.num_nodes(), data=ndata)
-    cg.add_edges(
-        torch.tensor_split(src, world_size)[rank].cuda(),
-        torch.tensor_split(dst, world_size)[rank].cuda(),
-        data=edata,
-    )
-
-    return (
-        cg,
-        torch.tensor_split(train_idx, world_size)[rank].to(torch.int64),
-        torch.tensor_split(valid_idx, world_size)[rank].to(torch.int64),
-        torch.tensor_split(test_idx, world_size)[rank].to(torch.int64),
-        num_classes,
-    )
-
-
-def create_dataloader(gs, train_idx, device, temp_dir, stage):
-    import cugraph_dgl
-
-    temp_path = os.path.join(temp_dir, f"{stage}_{device}")
-    os.mkdir(temp_path)
-
-    sampler = cugraph_dgl.dataloading.NeighborSampler(
-        [10, 20],
-        directory=temp_path,
-        batches_per_partition=10,
-    )
-    dataloader = cugraph_dgl.dataloading.FutureDataLoader(
-        gs,
-        train_idx,
-        sampler,
-        device=device,  # Put the sampled MFGs on CPU or GPU
-        use_ddp=True,  # Make it work with distributed data parallel
-        batch_size=1024,
-        shuffle=False,  # Whether to shuffle the nodes for every epoch
-        drop_last=False,
-        num_workers=0,
-    )
-    return dataloader
-
-
-def run_workflow(rank, world_size, cugraph_id, dataset, temp_dir):
-    from model import Sage, train_model
-
-    # Below sets gpu_number
-    dev_id = rank
-    initalize_pytorch_worker(dev_id)
-    device = torch.device(f"cuda:{dev_id}")
-
-    # Pytorch training worker initialization
-    dist_init_method = "tcp://{master_ip}:{master_port}".format(
-        master_ip="127.0.0.1", master_port="12346"
-    )
-
-    torch.distributed.init_process_group(
-        backend="nccl",
-        init_method=dist_init_method,
-        world_size=world_size,
-        rank=rank,
-    )
-
-    cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank)
-    wm_init(rank, world_size, rank, world_size)
-
-    print(f"rank {rank}.", flush=True)
-    print("Initalized across GPUs.")
-
-    (
-        gs,
-        train_idx,
-        valid_idx,
-        test_idx,
-        num_classes,
-    ) = create_cugraph_graphstore_from_dgl_dataset(
-        dataset,
-        rank,
-        world_size,
-    )
-    del dataset
-
-    torch.distributed.barrier()
-    print(f"Loading graph to worker {rank} is complete", flush=True)
-
-    dataloader = create_dataloader(gs, train_idx, device, temp_dir, "train")
-    print("Dataloader Creation Complete", flush=True)
-    num_feats = gs.ndata["feat"].shape[1]
-    hid_size = 256
-    # Load Training example
-    model = Sage(num_feats, hid_size, num_classes).to(device)
-    model = torch.nn.parallel.DistributedDataParallel(
-        model,
-        device_ids=[device],
-        output_device=device,
-    )
-    torch.distributed.barrier()
-    n_epochs = 10
-    total_st = time.time()
-    opt = torch.optim.Adam(model.parameters(), lr=0.01)
-    train_model(model, gs, opt, dataloader, n_epochs, rank, valid_idx)
-    torch.distributed.barrier()
-    total_et = time.time()
-    print(
-        f"Total time taken on n_epochs {n_epochs} = {total_et - total_st} s",
-        f"measured by worker = {rank}",
-    )
-
-    torch.cuda.synchronize()
-    wm_finalize()
-    cugraph_comms_shutdown()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--dataset_root", type=str, default="dataset")
-    parser.add_argument("--tempdir_root", type=str, default=None)
-    parser.add_argument("--dataset", type=str, default="ogbn-products")
-    args = parser.parse_args()
-
-    from rmm.allocators.torch import rmm_torch_allocator
-
-    torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
-
-    # Create the uid needed for cuGraph comms
-    cugraph_id = cugraph_comms_create_unique_id()
-
-    ds = load_dgl_dataset(args.dataset, args.dataset_root)
-
-    world_size = torch.cuda.device_count()
-
-    with tempfile.TemporaryDirectory(dir=args.tempdir_root) as directory:
-        torch.multiprocessing.spawn(
-            run_workflow,
-            args=(world_size, cugraph_id, ds, directory),
-            nprocs=world_size,
-        )
diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml
deleted file mode 100644
index e32dff6cbb0..00000000000
--- a/python/cugraph-dgl/pyproject.toml
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-[build-system]
-
-requires = [
-    "rapids-build-backend>=0.3.1,<0.4.0.dev0",
-    "setuptools>=61.0.0",
-    "wheel",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-build-backend = "rapids_build_backend.build"
-
-[project]
-name = "cugraph-dgl"
-dynamic = ["version"]
-description = "cugraph extensions for DGL"
-readme = { file = "README.md", content-type = "text/markdown" }
-authors = [
-    { name = "NVIDIA Corporation" },
-]
-license = { text = "Apache 2.0" }
-requires-python = ">=3.10"
-classifiers = [
-    "Intended Audience :: Developers",
-    "Programming Language :: Python",
-]
-dependencies = [
-    "cugraph==25.2.*,>=0.0.0a0",
-    "numba>=0.57",
-    "numpy>=1.23,<3.0a0",
-    "pylibcugraphops==25.2.*,>=0.0.0a0",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-
-[project.optional-dependencies]
-test = [
-    "pandas",
-    "pylibwholegraph==25.2.*,>=0.0.0a0",
-    "pytest",
-    "pytest-benchmark",
-    "pytest-cov",
-    "pytest-xdist",
-    "scipy",
-    "tensordict>=0.1.2",
-    "torch>=2.3,<2.4.0a0",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-
-[project.urls]
-Homepage = "https://github.com/rapidsai/cugraph"
-Documentation = "https://docs.rapids.ai/api/cugraph/stable/"
-
-[tool.setuptools]
-license-files = ["LICENSE"]
-
-[tool.setuptools.dynamic]
-version = {file = "cugraph_dgl/VERSION"}
-
-[tool.setuptools.packages.find]
-include = [
-    "cugraph_dgl*",
-]
-
-[tool.rapids-build-backend]
-build-backend = "setuptools.build_meta"
-dependencies-file = "../../dependencies.yaml"
-matrix-entry = "cuda_suffixed=true"
diff --git a/python/cugraph-dgl/tests/test_version.py b/python/cugraph-dgl/tests/test_version.py
deleted file mode 100644
index 343e4fb2675..00000000000
--- a/python/cugraph-dgl/tests/test_version.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-import cugraph_dgl
-
-
-def test_version_constants_are_populated():
-    # __git_commit__ will only be non-empty in a built distribution
-    assert isinstance(cugraph_dgl.__git_commit__, str)
-
-    # __version__ should always be non-empty
-    assert isinstance(cugraph_dgl.__version__, str)
-    assert len(cugraph_dgl.__version__) > 0
diff --git a/python/cugraph-equivariant/LICENSE b/python/cugraph-equivariant/LICENSE
deleted file mode 120000
index 30cff7403da..00000000000
--- a/python/cugraph-equivariant/LICENSE
+++ /dev/null
@@ -1 +0,0 @@
-../../LICENSE
\ No newline at end of file
diff --git a/python/cugraph-equivariant/README.md b/python/cugraph-equivariant/README.md
deleted file mode 100644
index d5de8852709..00000000000
--- a/python/cugraph-equivariant/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# cugraph-equivariant
-
-## Description
-
-cugraph-equivariant library provides fast symmetry-preserving (equivariant) operations and convolutional layers, to accelerate the equivariant neural networks in drug discovery and other domains.
diff --git a/python/cugraph-equivariant/cugraph_equivariant/VERSION b/python/cugraph-equivariant/cugraph_equivariant/VERSION
deleted file mode 120000
index d62dc733efd..00000000000
--- a/python/cugraph-equivariant/cugraph_equivariant/VERSION
+++ /dev/null
@@ -1 +0,0 @@
-../../../VERSION
\ No newline at end of file
diff --git a/python/cugraph-equivariant/cugraph_equivariant/__init__.py b/python/cugraph-equivariant/cugraph_equivariant/__init__.py
deleted file mode 100644
index 20507bd9329..00000000000
--- a/python/cugraph-equivariant/cugraph_equivariant/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from cugraph_equivariant._version import __git_commit__, __version__
diff --git a/python/cugraph-equivariant/cugraph_equivariant/_version.py b/python/cugraph-equivariant/cugraph_equivariant/_version.py
deleted file mode 100644
index 940ebac74f7..00000000000
--- a/python/cugraph-equivariant/cugraph_equivariant/_version.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import importlib.resources
-
-# Read VERSION file from the module that is symlinked to VERSION file
-# in the root of the repo at build time or copied to the module at
-# installation. VERSION is a separate file that allows CI build-time scripts
-# to update version info (including commit hashes) without modifying
-# source files.
-__version__ = (
-    importlib.resources.files(__package__).joinpath("VERSION").read_text().strip()
-)
-try:
-    __git_commit__ = (
-        importlib.resources.files(__package__)
-        .joinpath("GIT_COMMIT")
-        .read_text()
-        .strip()
-    )
-except FileNotFoundError:
-    __git_commit__ = ""
-
-__all__ = ["__git_commit__", "__version__"]
diff --git a/python/cugraph-equivariant/cugraph_equivariant/nn/__init__.py b/python/cugraph-equivariant/cugraph_equivariant/nn/__init__.py
deleted file mode 100644
index 8f4d8de0042..00000000000
--- a/python/cugraph-equivariant/cugraph_equivariant/nn/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .tensor_product_conv import FullyConnectedTensorProductConv
-
-DiffDockTensorProductConv = FullyConnectedTensorProductConv
-
-__all__ = [
-    "FullyConnectedTensorProductConv",
-    "DiffDockTensorProductConv",
-]
diff --git a/python/cugraph-equivariant/cugraph_equivariant/nn/tensor_product_conv.py b/python/cugraph-equivariant/cugraph_equivariant/nn/tensor_product_conv.py
deleted file mode 100644
index 923edbfc44a..00000000000
--- a/python/cugraph-equivariant/cugraph_equivariant/nn/tensor_product_conv.py
+++ /dev/null
@@ -1,283 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Sequence, Union, NamedTuple
-
-import torch
-from torch import nn
-from e3nn import o3
-from e3nn.nn import BatchNorm
-
-from cugraph_equivariant.utils import scatter_reduce
-
-try:
-    from pylibcugraphops.pytorch.operators import FusedFullyConnectedTensorProduct
-except ImportError as exc:
-    raise RuntimeError(
-        "FullyConnectedTensorProductConv is no longer supported in "
-        "cugraph-equivariant starting from version 24.08. It will be migrated "
-        "to the new `cuequivariance` package. Please use 24.06 release for the "
-        "legacy interface."
-    ) from exc
-
-
-class Graph(NamedTuple):
-    edge_index: torch.Tensor
-    size: tuple[int, int]
-
-
-class FullyConnectedTensorProductConv(nn.Module):
-    r"""Message passing layer for tensor products in DiffDock-like architectures.
-    The left operand of tensor product is the spherical harmonic representation
-    of edge vector; the right operand consists of node features in irreps.
-
-    .. math::
-        \sum_{b \in \mathcal{N}_a} Y\left(\hat{r}_{a b}\right)
-        \otimes_{\psi_{a b}} \mathbf{h}_b
-
-    where the path weights :math:`\psi_{a b}` can be constructed from edge
-    embeddings and scalar features using an MLP:
-
-    .. math::
-        \psi_{a b} = \operatorname{MLP}
-        \left(e_{a b}, \mathbf{h}_a^0, \mathbf{h}_b^0\right)
-
-    Users have the option to either directly input the weights or provide the
-    MLP parameters and scalar features from edges and nodes.
-
-    Parameters
-    ----------
-    in_irreps : e3nn.o3.Irreps
-        Irreps for the input node features.
-
-    sh_irreps : e3nn.o3.Irreps
-        Irreps for the spherical harmonic representations of edge vectors.
-
-    out_irreps : e3nn.o3.Irreps
-        Irreps for the output.
-
-    batch_norm : bool, optional (default=True)
-        If true, batch normalization is applied.
-
-    mlp_channels : sequence of ints, optional (default=None)
-        A sequence of integers defining number of neurons in each layer in MLP
-        before the output layer. If `None`, no MLP will be added. The input layer
-        contains edge embeddings and node scalar features.
-
-    mlp_activation : nn.Module or sequence of nn.Module, optional (default=nn.GELU())
-        A sequence of functions to be applied in between linear layers in MLP,
-        e.g., `nn.Sequential(nn.ReLU(), nn.Dropout(0.4))`.
-
-    e3nn_compat_mode: bool, optional (default=False)
-        cugraph-ops and e3nn use different memory layout for Irreps-tensors.
-        The last (fastest moving) dimension is num_channels for cugraph-ops and
-        ir.dim for e3nn. When enabled, the input and output of this layer will
-        follow e3nn's memory layout.
-
-    Examples
-    --------
-    Case 1: MLP with the input layer having 6 channels and 2 hidden layers
-    having 16 channels. edge_emb.size(1) must match the size of the input layer: 6
-
-    >>> conv1 = FullyConnectedTensorProductConv(in_irreps, sh_irreps, out_irreps,
-    >>>     mlp_channels=[6, 16, 16], mlp_activation=nn.ReLU()).cuda()
-    >>> out = conv1(src_features, edge_sh, edge_emb, graph)
-
-    Case 2: If `edge_emb` is constructed by concatenating scalar features from
-    edges, sources and destinations, as in DiffDock, the layer can accept each
-    scalar component separately:
-
-    >>> conv2 = FullyConnectedTensorProductConv(in_irreps, sh_irreps, out_irreps,
-    >>>     mlp_channels=[6, 16, 16], mlp_activation=nn.ReLU()).cuda()
-    >>> out = conv2(src_features, edge_sh, edge_scalars, graph,
-    >>>     src_scalars=src_scalars, dst_scalars=dst_scalars)
-
-    This allows a smaller GEMM in the first MLP layer by performing GEMM on each
-    component before indexing. The first-layer weights are split into sections
-    for edges, sources and destinations, in that order.This is equivalent to
-
-    >>> src, dst = graph.edge_index
-    >>> edge_emb = torch.hstack((edge_scalars, src_scalars[src], dst_scalars[dst]))
-    >>> out = conv2(src_features, edge_sh, edge_emb, graph)
-
-    Case 3: No MLP, `edge_emb` will be directly used as the tensor product weights:
-
-    >>> conv3 = FullyConnectedTensorProductConv(in_irreps, sh_irreps, out_irreps,
-    >>>     mlp_channels=None).cuda()
-    >>> out = conv3(src_features, edge_sh, edge_emb, graph)
-
-    """
-
-    def __init__(
-        self,
-        in_irreps: o3.Irreps,
-        sh_irreps: o3.Irreps,
-        out_irreps: o3.Irreps,
-        batch_norm: bool = True,
-        mlp_channels: Optional[Sequence[int]] = None,
-        mlp_activation: Union[nn.Module, Sequence[nn.Module]] = nn.GELU(),
-        e3nn_compat_mode: bool = False,
-    ):
-        super().__init__()
-        self.in_irreps = in_irreps
-        self.out_irreps = out_irreps
-        self.sh_irreps = sh_irreps
-        self.e3nn_compat_mode = e3nn_compat_mode
-
-        self.tp = FusedFullyConnectedTensorProduct(
-            in_irreps, sh_irreps, out_irreps, e3nn_compat_mode=e3nn_compat_mode
-        )
-
-        self.batch_norm = BatchNorm(out_irreps) if batch_norm else None
-
-        if mlp_activation is None:
-            mlp_activation = []
-        elif hasattr(mlp_activation, "__len__") and hasattr(
-            mlp_activation, "__getitem__"
-        ):
-            mlp_activation = list(mlp_activation)
-        else:
-            mlp_activation = [mlp_activation]
-
-        if mlp_channels is not None:
-            dims = list(mlp_channels) + [self.tp.weight_numel]
-            mlp = []
-            for i in range(len(dims) - 1):
-                mlp.append(nn.Linear(dims[i], dims[i + 1]))
-                if i != len(dims) - 2:
-                    mlp.extend(mlp_activation)
-            self.mlp = nn.Sequential(*mlp)
-        else:
-            self.mlp = None
-
-    def forward(
-        self,
-        src_features: torch.Tensor,
-        edge_sh: torch.Tensor,
-        edge_emb: torch.Tensor,
-        graph: tuple[torch.Tensor, tuple[int, int]],
-        src_scalars: Optional[torch.Tensor] = None,
-        dst_scalars: Optional[torch.Tensor] = None,
-        reduce: str = "mean",
-        edge_envelope: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Forward pass.
-
-        Parameters
-        ----------
-        src_features : torch.Tensor
-            Source node features.
-            Shape: (num_src_nodes, in_irreps.dim)
-
-        edge_sh : torch.Tensor
-            The spherical harmonic representations of the edge vectors.
-            Shape: (num_edges, sh_irreps.dim)
-
-        edge_emb: torch.Tensor
-            Edge embeddings that are fed into MLPs to generate tensor product weights.
-            Shape: (num_edges, dim), where `dim` should be:
-            - `tp.weight_numel` when the layer does not contain MLPs.
-            - num_edge_scalars, when scalar features from edges, sources and
-              destinations are passed in separately.
-
-        graph : tuple
-            A tuple that stores the graph information, with the first element being
-            the adjacency matrix in COO, and the second element being its shape:
-            (num_src_nodes, num_dst_nodes).
-
-        src_scalars: torch.Tensor, optional
-            Scalar features of source nodes. See examples for usage.
-            Shape: (num_src_nodes, num_src_scalars)
-
-        dst_scalars: torch.Tensor, optional
-            Scalar features of destination nodes. See examples for usage.
-            Shape: (num_dst_nodes, num_dst_scalars)
-
-        reduce : str, optional (default="mean")
-            Reduction operator. Choose between "mean" and "sum".
-
-        edge_envelope: torch.Tensor, optional
-            Typically used as attenuation factors to fade out messages coming
-            from nodes close to the cutoff distance used to create the graph.
-            This is important to make the model smooth to the changes in node's
-            coordinates.
-            Shape: (num_edges,)
-
-        Returns
-        -------
-        torch.Tensor
-            Output node features.
-            Shape: (num_dst_nodes, out_irreps.dim)
-        """
-        edge_emb_size = edge_emb.size(-1)
-        src_scalars_size = 0 if src_scalars is None else src_scalars.size(-1)
-        dst_scalars_size = 0 if dst_scalars is None else dst_scalars.size(-1)
-
-        if self.mlp is None:
-            if self.tp.weight_numel != edge_emb_size:
-                raise RuntimeError(
-                    f"When MLP is not present, edge_emb's last dimension must "
-                    f"equal tp.weight_numel (but got {edge_emb_size} and "
-                    f"{self.tp.weight_numel})"
-                )
-        else:
-            total_size = edge_emb_size + src_scalars_size + dst_scalars_size
-            if self.mlp[0].in_features != total_size:
-                raise RuntimeError(
-                    f"The size of MLP's input layer ({self.mlp[0].in_features}) "
-                    f"does not match the total number of scalar features from "
-                    f"edge_emb, src_scalars and dst_scalars ({total_size})"
-                )
-
-        if reduce not in ["mean", "sum"]:
-            raise RuntimeError(
-                f"reduce argument must be either 'mean' or 'sum', got {reduce}."
-            )
-
-        (src, dst), (num_src_nodes, num_dst_nodes) = graph
-
-        if self.mlp is not None:
-            if src_scalars is None and dst_scalars is None:
-                tp_weights = self.mlp(edge_emb)
-            else:
-                w_edge, w_src, w_dst = torch.split(
-                    self.mlp[0].weight,
-                    (edge_emb_size, src_scalars_size, dst_scalars_size),
-                    dim=-1,
-                )
-                tp_weights = edge_emb @ w_edge.T + self.mlp[0].bias
-
-                if src_scalars is not None:
-                    tp_weights += (src_scalars @ w_src.T)[src]
-
-                if dst_scalars is not None:
-                    tp_weights += (dst_scalars @ w_dst.T)[dst]
-
-                tp_weights = self.mlp[1:](tp_weights)
-        else:
-            tp_weights = edge_emb
-
-        out = self.tp(src_features[src], edge_sh, tp_weights)
-
-        if edge_envelope is not None:
-            out = out * edge_envelope.view(-1, 1)
-
-        dtype = out.dtype
-        out = scatter_reduce(
-            out.float(), dst, dim=0, dim_size=num_dst_nodes, reduce=reduce
-        ).to(dtype)
-
-        if self.batch_norm:
-            out = self.batch_norm(out)
-
-        return out
diff --git a/python/cugraph-equivariant/cugraph_equivariant/tests/conftest.py b/python/cugraph-equivariant/cugraph_equivariant/tests/conftest.py
deleted file mode 100644
index 806e03e6d76..00000000000
--- a/python/cugraph-equivariant/cugraph_equivariant/tests/conftest.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-import torch
-
-
-@pytest.fixture
-def example_scatter_data():
-    src_feat = torch.Tensor([3, 1, 0, 1, 1, 2])
-    dst_indices = torch.Tensor([0, 1, 2, 2, 3, 1])
-
-    results = {
-        "sum": torch.Tensor([3.0, 3.0, 1.0, 1.0]),
-        "mean": torch.Tensor([3.0, 1.5, 0.5, 1.0]),
-        "prod": torch.Tensor([3.0, 2.0, 0.0, 1.0]),
-        "amax": torch.Tensor([3.0, 2.0, 1.0, 1.0]),
-        "amin": torch.Tensor([3.0, 1.0, 0.0, 1.0]),
-    }
-
-    return src_feat, dst_indices, results
-
-
-@pytest.fixture
-def empty_scatter_data():
-    src_feat = torch.empty((0, 41))
-    dst_indices = torch.empty((0,))
-
-    return src_feat, dst_indices
diff --git a/python/cugraph-equivariant/cugraph_equivariant/tests/pytest.ini b/python/cugraph-equivariant/cugraph_equivariant/tests/pytest.ini
deleted file mode 100644
index 7b0a9f29fb1..00000000000
--- a/python/cugraph-equivariant/cugraph_equivariant/tests/pytest.ini
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-[pytest]
-addopts = --tb=native
diff --git a/python/cugraph-equivariant/cugraph_equivariant/tests/test_scatter.py b/python/cugraph-equivariant/cugraph_equivariant/tests/test_scatter.py
deleted file mode 100644
index d28a32edcb1..00000000000
--- a/python/cugraph-equivariant/cugraph_equivariant/tests/test_scatter.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-import torch
-from cugraph_equivariant.utils import scatter_reduce
-
-
-@pytest.mark.parametrize("reduce", ["sum", "mean", "prod", "amax", "amin"])
-def test_scatter_reduce(example_scatter_data, reduce):
-    device = torch.device("cuda")
-    src, index, out_true = example_scatter_data
-    src = src.to(device)
-    index = index.to(device)
-
-    out = scatter_reduce(src, index, dim=0, dim_size=None, reduce=reduce)
-
-    assert torch.allclose(out.cpu(), out_true[reduce])
-
-
-def test_scatter_reduce_empty(empty_scatter_data):
-    device = torch.device("cuda")
-    src, index = empty_scatter_data
-    src = src.to(device)
-    index = index.to(device)
-
-    out = scatter_reduce(src, index, dim=0, dim_size=None)
-
-    assert out.numel() == 0
-    assert out.size(1) == src.size(1)
diff --git a/python/cugraph-equivariant/cugraph_equivariant/tests/test_tensor_product_conv.py b/python/cugraph-equivariant/cugraph_equivariant/tests/test_tensor_product_conv.py
deleted file mode 100644
index ce325c47aa0..00000000000
--- a/python/cugraph-equivariant/cugraph_equivariant/tests/test_tensor_product_conv.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-try:
-    from cugraph_equivariant.nn import FullyConnectedTensorProductConv
-except RuntimeError:
-    pytest.skip(
-        "Migrated to cuequivariance package starting from 24.08.",
-        allow_module_level=True,
-    )
-
-import torch
-from torch import nn
-from e3nn import o3
-from cugraph_equivariant.nn.tensor_product_conv import Graph
-
-device = torch.device("cuda")
-
-
-def create_random_graph(
-    num_src_nodes,
-    num_dst_nodes,
-    num_edges,
-    dtype=None,
-    device=None,
-):
-    row = torch.randint(num_src_nodes, (num_edges,), dtype=dtype, device=device)
-    col = torch.randint(num_dst_nodes, (num_edges,), dtype=dtype, device=device)
-    edge_index = torch.stack([row, col], dim=0)
-
-    return Graph(edge_index, (num_src_nodes, num_dst_nodes))
-
-
-@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("e3nn_compat_mode", [True, False])
-@pytest.mark.parametrize("batch_norm", [True, False])
-@pytest.mark.parametrize(
-    "mlp_channels, mlp_activation, scalar_sizes",
-    [
-        [(30, 8, 8), nn.Sequential(nn.Dropout(0.3), nn.ReLU()), (15, 15, 0)],
-        [(7,), nn.GELU(), (2, 3, 2)],
-        [None, None, None],
-    ],
-)
-def test_tensor_product_conv_equivariance(
-    mlp_channels, mlp_activation, scalar_sizes, batch_norm, e3nn_compat_mode, dtype
-):
-    torch.manual_seed(12345)
-    to_kwargs = {"device": device, "dtype": dtype}
-
-    in_irreps = o3.Irreps("10x0e + 10x1e")
-    out_irreps = o3.Irreps("20x0e + 10x1e")
-    sh_irreps = o3.Irreps.spherical_harmonics(lmax=2)
-
-    tp_conv = FullyConnectedTensorProductConv(
-        in_irreps=in_irreps,
-        sh_irreps=sh_irreps,
-        out_irreps=out_irreps,
-        mlp_channels=mlp_channels,
-        mlp_activation=mlp_activation,
-        batch_norm=batch_norm,
-        e3nn_compat_mode=e3nn_compat_mode,
-    ).to(**to_kwargs)
-
-    num_src_nodes, num_dst_nodes = 9, 7
-    num_edges = 40
-    graph = create_random_graph(num_src_nodes, num_dst_nodes, num_edges, device=device)
-
-    edge_sh = torch.randn(num_edges, sh_irreps.dim, **to_kwargs)
-    src_features = torch.randn(num_src_nodes, in_irreps.dim, **to_kwargs)
-
-    rot = o3.rand_matrix()
-    D_in = tp_conv.in_irreps.D_from_matrix(rot).to(**to_kwargs)
-    D_sh = tp_conv.sh_irreps.D_from_matrix(rot).to(**to_kwargs)
-    D_out = tp_conv.out_irreps.D_from_matrix(rot).to(**to_kwargs)
-
-    if mlp_channels is None:
-        edge_emb = torch.randn(num_edges, tp_conv.tp.weight_numel, **to_kwargs)
-        src_scalars = dst_scalars = None
-    else:
-        if scalar_sizes:
-            edge_emb = torch.randn(num_edges, scalar_sizes[0], **to_kwargs)
-            src_scalars = (
-                None
-                if scalar_sizes[1] == 0
-                else torch.randn(num_src_nodes, scalar_sizes[1], **to_kwargs)
-            )
-            dst_scalars = (
-                None
-                if scalar_sizes[2] == 0
-                else torch.randn(num_dst_nodes, scalar_sizes[2], **to_kwargs)
-            )
-        else:
-            edge_emb = torch.randn(num_edges, tp_conv.mlp[0].in_features, **to_kwargs)
-            src_scalars = dst_scalars = None
-
-    # rotate before
-    torch.manual_seed(12345)
-    out_before = tp_conv(
-        src_features=src_features @ D_in.T,
-        edge_sh=edge_sh @ D_sh.T,
-        edge_emb=edge_emb,
-        graph=graph,
-        src_scalars=src_scalars,
-        dst_scalars=dst_scalars,
-    )
-
-    # rotate after
-    torch.manual_seed(12345)
-    out_after = (
-        tp_conv(
-            src_features=src_features,
-            edge_sh=edge_sh,
-            edge_emb=edge_emb,
-            graph=graph,
-            src_scalars=src_scalars,
-            dst_scalars=dst_scalars,
-        )
-        @ D_out.T
-    )
-
-    atol = 1e-3 if dtype == torch.float32 else 1e-1
-    if e3nn_compat_mode:
-        assert torch.allclose(out_before, out_after, rtol=1e-4, atol=atol)
diff --git a/python/cugraph-equivariant/cugraph_equivariant/tests/test_version.py b/python/cugraph-equivariant/cugraph_equivariant/tests/test_version.py
deleted file mode 100644
index e8b484fe16c..00000000000
--- a/python/cugraph-equivariant/cugraph_equivariant/tests/test_version.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-import cugraph_equivariant
-
-
-def test_version_constants_are_populated():
-    # __git_commit__ will only be non-empty in a built distribution
-    assert isinstance(cugraph_equivariant.__git_commit__, str)
-
-    # __version__ should always be non-empty
-    assert isinstance(cugraph_equivariant.__version__, str)
-    assert len(cugraph_equivariant.__version__) > 0
diff --git a/python/cugraph-equivariant/cugraph_equivariant/utils/__init__.py b/python/cugraph-equivariant/cugraph_equivariant/utils/__init__.py
deleted file mode 100644
index b4acfe8d090..00000000000
--- a/python/cugraph-equivariant/cugraph_equivariant/utils/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .scatter import scatter_reduce
-
-__all__ = [
-    "scatter_reduce",
-]
diff --git a/python/cugraph-equivariant/cugraph_equivariant/utils/scatter.py b/python/cugraph-equivariant/cugraph_equivariant/utils/scatter.py
deleted file mode 100644
index 909fbc99365..00000000000
--- a/python/cugraph-equivariant/cugraph_equivariant/utils/scatter.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional
-
-import torch
-
-
-def broadcast(src: torch.Tensor, ref: torch.Tensor, dim: int) -> torch.Tensor:
-    size = ((1,) * dim) + (-1,) + ((1,) * (ref.dim() - dim - 1))
-    return src.view(size).expand_as(ref)
-
-
-def scatter_reduce(
-    src: torch.Tensor,
-    index: torch.Tensor,
-    dim: int = 0,
-    dim_size: Optional[int] = None,  # value of out.size(dim)
-    reduce: str = "sum",  # "sum", "prod", "mean", "amax", "amin"
-):
-    # scatter() expects index to be int64
-    index = broadcast(index, src, dim).to(torch.int64)
-
-    size = list(src.size())
-
-    if dim_size is not None:
-        size[dim] = dim_size
-    else:
-        size[dim] = 0 if index.numel() == 0 else int(index.max()) + 1
-
-    out = torch.zeros(size, dtype=src.dtype, device=src.device)
-    return out.scatter_reduce_(dim, index, src, reduce, include_self=False)
diff --git a/python/cugraph-equivariant/pyproject.toml b/python/cugraph-equivariant/pyproject.toml
deleted file mode 100644
index 9fb19de4c81..00000000000
--- a/python/cugraph-equivariant/pyproject.toml
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-[build-system]
-requires = [
-    "rapids-build-backend>=0.3.1,<0.4.0.dev0",
-    "setuptools>=61.0.0",
-    "wheel",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-build-backend = "rapids_build_backend.build"
-
-[project]
-name = "cugraph-equivariant"
-dynamic = ["version"]
-description = "Fast GPU-based equivariant operations and convolutional layers."
-readme = { file = "README.md", content-type = "text/markdown" }
-authors = [
-    { name = "NVIDIA Corporation" },
-]
-license = { text = "Apache 2.0" }
-requires-python = ">=3.10"
-classifiers = [
-    "Intended Audience :: Developers",
-    "Programming Language :: Python",
-    "Programming Language :: Python :: 3.10",
-    "Programming Language :: Python :: 3.11",
-    "Programming Language :: Python :: 3.12",
-]
-dependencies = [
-    "pylibcugraphops==25.2.*,>=0.0.0a0",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-
-[project.urls]
-Homepage = "https://github.com/rapidsai/cugraph"
-Documentation = "https://docs.rapids.ai/api/cugraph/stable/api_docs/cugraph-ops/"
-
-[project.optional-dependencies]
-test = [
-    "pandas",
-    "pytest",
-    "pytest-benchmark",
-    "pytest-cov",
-    "pytest-xdist",
-    "scipy",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-
-[tool.setuptools]
-license-files = ["LICENSE"]
-
-[tool.setuptools.dynamic]
-version = {file = "cugraph_equivariant/VERSION"}
-
-[tool.setuptools.packages.find]
-include = [
-    "cugraph_equivariant*",
-    "cugraph_equivariant.*",
-]
-
-[tool.rapids-build-backend]
-build-backend = "setuptools.build_meta"
-dependencies-file = "../../dependencies.yaml"
-matrix-entry = "cuda_suffixed=true"
-
-[tool.pydistcheck]
-select = [
-    "distro-too-large-compressed",
-]
-
-# PyPI limit is 100 MiB, fail CI before we get too close to that
-max_allowed_size_compressed = '75M'
diff --git a/python/cugraph-equivariant/setup.py b/python/cugraph-equivariant/setup.py
deleted file mode 100644
index acd0df3f717..00000000000
--- a/python/cugraph-equivariant/setup.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from setuptools import find_packages, setup
-
-if __name__ == "__main__":
-    packages = find_packages(include=["cugraph_equivariant*"])
-    setup(
-        package_data={key: ["VERSION"] for key in packages},
-    )
diff --git a/python/cugraph-pyg/LICENSE b/python/cugraph-pyg/LICENSE
deleted file mode 120000
index 30cff7403da..00000000000
--- a/python/cugraph-pyg/LICENSE
+++ /dev/null
@@ -1 +0,0 @@
-../../LICENSE
\ No newline at end of file
diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml
deleted file mode 100644
index bd38c8ad62e..00000000000
--- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-# This file is generated by `rapids-dependency-file-generator`.
-# To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`.
-channels:
-- rapidsai
-- rapidsai-nightly
-- dask/label/dev
-- dglteam/label/th23_cu118
-- conda-forge
-- nvidia
-dependencies:
-- cugraph==25.2.*,>=0.0.0a0
-- pandas
-- pre-commit
-- pylibcugraphops==25.2.*,>=0.0.0a0
-- pytest
-- pytest-benchmark
-- pytest-cov
-- pytest-xdist
-- pytorch-cuda==11.8
-- pytorch>=2.3,<2.4.0a0
-- pytorch_geometric>=2.5,<2.6
-- scipy
-- tensordict>=0.1.2
-name: cugraph_pyg_dev_cuda-118
diff --git a/python/cugraph-pyg/cugraph_pyg/VERSION b/python/cugraph-pyg/cugraph_pyg/VERSION
deleted file mode 120000
index d62dc733efd..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/VERSION
+++ /dev/null
@@ -1 +0,0 @@
-../../../VERSION
\ No newline at end of file
diff --git a/python/cugraph-pyg/cugraph_pyg/__init__.py b/python/cugraph-pyg/cugraph_pyg/__init__.py
deleted file mode 100644
index e566e6e9fdd..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from cugraph_pyg._version import __git_commit__, __version__
-
-import cugraph_pyg.data
-import cugraph_pyg.loader
-import cugraph_pyg.sampler
-import cugraph_pyg.nn
diff --git a/python/cugraph-pyg/cugraph_pyg/_version.py b/python/cugraph-pyg/cugraph_pyg/_version.py
deleted file mode 100644
index 053b163116d..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/_version.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import importlib.resources
-
-# Read VERSION file from the module that is symlinked to VERSION file
-# in the root of the repo at build time or copied to the moudle at
-# installation. VERSION is a separate file that allows CI build-time scripts
-# to update version info (including commit hashes) without modifying
-# source files.
-__version__ = (
-    importlib.resources.files(__package__).joinpath("VERSION").read_text().strip()
-)
-try:
-    __git_commit__ = (
-        importlib.resources.files(__package__)
-        .joinpath("GIT_COMMIT")
-        .read_text()
-        .strip()
-    )
-except FileNotFoundError:
-    __git_commit__ = ""
-
-__all__ = ["__git_commit__", "__version__"]
diff --git a/python/cugraph-pyg/cugraph_pyg/data/__init__.py b/python/cugraph-pyg/cugraph_pyg/data/__init__.py
deleted file mode 100644
index 6d51fd5ea01..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/data/__init__.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-from cugraph_pyg.data.dask_graph_store import DaskGraphStore
-from cugraph_pyg.data.graph_store import GraphStore
-from cugraph_pyg.data.feature_store import (
-    TensorDictFeatureStore,
-    WholeFeatureStore,
-)
-
-
-def CuGraphStore(*args, **kwargs):
-    warnings.warn("CuGraphStore has been renamed to DaskGraphStore", FutureWarning)
-    return DaskGraphStore(*args, **kwargs)
diff --git a/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py
deleted file mode 100644
index 6195f3118a4..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py
+++ /dev/null
@@ -1,1321 +0,0 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Tuple, Any, Union, List, Dict
-
-from enum import Enum, auto
-
-from dataclasses import dataclass
-from collections import defaultdict
-from itertools import chain
-from functools import cached_property
-
-import numpy as np
-import cupy
-import pandas
-import cudf
-import cugraph
-import warnings
-
-import dask.array as dar
-import dask.dataframe as dd
-import dask.distributed as distributed
-import dask_cudf
-
-from cugraph.utilities.utils import import_optional, MissingModule
-
-torch = import_optional("torch")
-torch_geometric = import_optional("torch_geometric")
-
-Tensor = None if isinstance(torch, MissingModule) else torch.Tensor
-NdArray = None if isinstance(cupy, MissingModule) else cupy.ndarray
-DaskCudfSeries = None if isinstance(dask_cudf, MissingModule) else dask_cudf.Series
-
-TensorType = Union[Tensor, NdArray, cudf.Series, DaskCudfSeries]
-NodeType = (
-    None
-    if isinstance(torch_geometric, MissingModule)
-    else torch_geometric.typing.NodeType
-)
-EdgeType = (
-    None
-    if isinstance(torch_geometric, MissingModule)
-    else torch_geometric.typing.EdgeType
-)
-
-
-class EdgeLayout(Enum):
-    COO = "coo"
-    CSC = "csc"
-    CSR = "csr"
-
-
-@dataclass
-class CuGraphEdgeAttr:
-    """
-    Defines the attributes of an :obj:`GraphStore` edge.
-    """
-
-    # The type of the edge
-    edge_type: Optional[Any]
-
-    # The layout of the edge representation
-    layout: EdgeLayout
-
-    # Whether the edge index is sorted, by destination node. Useful for
-    # avoiding sorting costs when performing neighbor sampling, and only
-    # meaningful for COO (CSC and CSR are sorted by definition)
-    is_sorted: bool = False
-
-    # The number of nodes in this edge type. If set to None, will attempt to
-    # infer with the simple heuristic int(self.edge_index.max()) + 1
-    size: Optional[Tuple[int, int]] = None
-
-    # NOTE we define __post_init__ to force-cast layout
-    def __post_init__(self):
-        self.layout = EdgeLayout(self.layout)
-
-    @classmethod
-    def cast(cls, *args, **kwargs):
-        """
-        Cast to a CuGraphTensorAttr from a tuple, list, or dict.
-
-        Returns
-        -------
-        CuGraphTensorAttr
-            contains the data of the tuple, list, or dict passed in
-        """
-        if len(args) == 1 and len(kwargs) == 0:
-            elem = args[0]
-            if elem is None:
-                return None
-            if isinstance(elem, CuGraphEdgeAttr):
-                return elem
-            if isinstance(elem, (tuple, list)):
-                return cls(*elem)
-            if isinstance(elem, dict):
-                return cls(**elem)
-        return cls(*args, **kwargs)
-
-
-class _field_status(Enum):
-    UNSET = auto()
-
-
-@dataclass
-class CuGraphTensorAttr:
-    """
-    Defines the attributes of a class:`FeatureStore` tensor; in particular,
-    all the parameters necessary to uniquely identify a tensor from the feature
-    store.
-
-    Note that the order of the attributes is important; this is the order in
-    which attributes must be provided for indexing calls. Feature store
-    implementor classes can define a different ordering by overriding
-    :meth:`TensorAttr.__init__`.
-    """
-
-    # The group name that the tensor corresponds to. Defaults to UNSET.
-    group_name: Optional[str] = _field_status.UNSET
-
-    # The name of the tensor within its group. Defaults to UNSET.
-    attr_name: Optional[str] = _field_status.UNSET
-
-    # The node indices the rows of the tensor correspond to. Defaults to UNSET.
-    index: Optional[Any] = _field_status.UNSET
-
-    # The properties in the FeatureStore the rows of the tensor correspond to.
-    # Defaults to UNSET.
-    properties: Optional[Any] = _field_status.UNSET
-
-    # The datatype of the tensor.  Defaults to UNSET.
-    dtype: Optional[Any] = _field_status.UNSET
-
-    # Convenience methods
-
-    def is_set(self, key):
-        """
-        Whether an attribute is set in :obj:`TensorAttr`.
-        """
-        if key not in self.__dataclass_fields__:
-            raise KeyError(key)
-        attr = getattr(self, key)
-        return type(attr) is not _field_status or attr != _field_status.UNSET
-
-    def is_fully_specified(self):
-        """
-        Whether the :obj:`TensorAttr` has no unset fields.
-        """
-        return all([self.is_set(key) for key in self.__dataclass_fields__])
-
-    def fully_specify(self):
-        """
-        Sets all :obj:`UNSET` fields to :obj:`None`.
-        """
-        for key in self.__dataclass_fields__:
-            if not self.is_set(key):
-                setattr(self, key, None)
-        return self
-
-    def update(self, attr):
-        """
-        Updates an :class:`TensorAttr` with set attributes from another
-        :class:`TensorAttr`.
-        """
-        for key in self.__dataclass_fields__:
-            if attr.is_set(key):
-                setattr(self, key, getattr(attr, key))
-
-    @classmethod
-    def cast(cls, *args, **kwargs):
-        """
-        Casts to a CuGraphTensorAttr from a tuple, list, or dict
-
-        Returns
-        -------
-        CuGraphTensorAttr
-            contains the data of the tuple, list, or dict passed in
-        """
-        if len(args) == 1 and len(kwargs) == 0:
-            elem = args[0]
-            if elem is None:
-                return None
-            if isinstance(elem, CuGraphTensorAttr):
-                return elem
-            if isinstance(elem, (tuple, list)):
-                return cls(*elem)
-            if isinstance(elem, dict):
-                return cls(**elem)
-        return cls(*args, **kwargs)
-
-
-class DaskGraphStore:
-    """
-    Duck-typed version of PyG's GraphStore and FeatureStore that uses
-    Dask to distribute the graph structure across GPUs and a
-    cugraph.gnn.FeatureStore to store node/edge features.  Supports
-    single-node/single-GPU, single-node/multi-GPU, and multi-node/multi-GPU
-    configurations.  Supports both homogeneous and heterogeneous graphs.
-    """
-
-    # TODO allow (and possibly require) separate stores for node, edge attrs
-    # For now edge attrs are entirely unsupported.
-    # TODO add an "expensive check" argument that ensures the graph store
-    # and feature store are valid and compatible with PyG.
-    def __init__(
-        self,
-        F: cugraph.gnn.FeatureStore,
-        G: Union[
-            Dict[Tuple[str, str, str], Tuple[TensorType]],
-            Dict[Tuple[str, str, str], int],
-        ],
-        num_nodes_dict: Dict[str, int],
-        *,
-        multi_gpu: bool = False,
-        order: str = "CSR",
-    ):
-        """
-        Constructs a new DaskGraphStore from the provided
-        arguments.
-
-        Parameters
-        ----------
-        F: cugraph.gnn.FeatureStore (Required)
-            The feature store containing this graph's features.
-            Typed lexicographic-ordered numbering convention
-            should match that of the graph.
-
-        G: dict[str, tuple[TensorType]] or dict[str, int] (Required)
-            Dictionary of edge indices.
-            Option 1 (graph in memory):
-
-                Pass the edge indices: i.e.
-                {
-                ('author', 'writes', 'paper'): [[0,1,2],[2,0,1]],
-                ('author', 'affiliated', 'institution'): [[0,1],[0,1]]
-                }
-
-
-            Option 2 (graph not in memory):
-
-                Pass the number of edges: i.e.
-                {
-                ('author', 'writes', 'paper'): 2,
-                ('author', 'affiliated', 'institution'): 2
-                }
-                If the graph is not in memory, manipulating the edge indices
-                or calling sampling is not possible.  This is for cases where
-                sampling has already been done and samples were written to disk.
-
-            Note: the internal cugraph representation will use
-            offsetted vertex and edge ids.
-
-        num_nodes_dict: dict (Required)
-            A dictionary mapping each node type to the count of nodes
-            of that type in the graph.
-
-        multi_gpu: bool (Optional, default = False)
-            Whether the store should be backed by a multi-GPU graph.
-            Requires dask to have been set up.
-
-        order: str (Optional ["CSR", "CSC"], default = CSR)
-            The order to use for sampling.  CSR corresponds to the
-            standard OGB dataset order that is usually used in PyG.
-            CSC order constructs the same graph as CSR, but with
-            edges in the opposite direction.
-        """
-
-        if None in G:
-            raise ValueError("Unspecified edge types not allowed in PyG")
-
-        if order != "CSR" and order != "CSC":
-            raise ValueError("invalid valid for order")
-
-        self.__vertex_dtype = torch.int64
-
-        self._tensor_attr_cls = CuGraphTensorAttr
-        self._tensor_attr_dict = defaultdict(list)
-
-        construct_graph = True
-        if isinstance(next(iter(G.values())), int):
-            # User has passed in the number of edges
-            # (not the actual edge index), so the number of edges
-            # does not need to be counted.
-            num_edges_dict = dict(G)  # make sure the cugraph store owns this dict
-            construct_graph = False
-        else:
-            # User has passed in the actual edge index, so the
-            # number of edges needs to be counted.
-            num_edges_dict = {
-                pyg_can_edge_type: len(ei[0]) for pyg_can_edge_type, ei in G.items()
-            }
-
-        self.__infer_offsets(num_nodes_dict, num_edges_dict)
-        self.__infer_existing_tensors(F)
-        self.__infer_edge_types(num_nodes_dict, num_edges_dict)
-
-        self._edge_attr_cls = CuGraphEdgeAttr
-
-        self.__features = F
-        self.__graph = None
-        self.__is_graph_owner = False
-        self.__order = order
-
-        if construct_graph:
-            if multi_gpu:
-                self.__graph = distributed.get_client().get_dataset(
-                    "cugraph_graph", default=None
-                )
-
-            if self.__graph is None:
-                self.__graph = self.__construct_graph(
-                    G, multi_gpu=multi_gpu, order=order
-                )
-                self.__is_graph_owner = True
-
-        self.__subgraphs = {}
-
-    def __del__(self):
-        if self.__is_graph_owner:
-            if isinstance(self.__graph._plc_graph, dict):
-                try:
-                    distributed.get_client().unpublish_dataset("cugraph_graph")
-                except TypeError:
-                    warnings.warn(
-                        "Could not unpublish graph dataset, most likely because"
-                        " dask has already shut down."
-                    )
-            del self.__graph
-
-    def __make_offsets(self, input_dict):
-        offsets = {}
-        offsets["stop"] = [input_dict[v] for v in sorted(input_dict.keys())]
-        offsets["stop"] = torch.tensor(offsets["stop"]).cuda()
-
-        cumsum = offsets["stop"].cumsum(0)
-        offsets["start"] = cumsum - offsets["stop"]
-        offsets["stop"] = cumsum - 1
-
-        offsets["type"] = np.array(sorted(input_dict.keys()))
-
-        return offsets
-
-    def __infer_offsets(
-        self,
-        num_nodes_dict: Dict[str, int],
-        num_edges_dict: Dict[Tuple[str, str, str], int],
-    ) -> None:
-        """
-        Sets the vertex offsets for this store.
-        """
-        self.__vertex_type_offsets = self.__make_offsets(num_nodes_dict)
-
-        # Need to convert tuples to string in order to use searchsorted
-        # Can convert back using x.split('__')
-        # Lexicographic ordering is unchanged.
-        self.__edge_type_offsets = self.__make_offsets(
-            {
-                "__".join(pyg_can_edge_type): n
-                for pyg_can_edge_type, n in num_edges_dict.items()
-            }
-        )
-
-    def __dask_array_from_numpy(self, array: np.ndarray, npartitions: int):
-        return dar.from_array(
-            array,
-            meta=np.array([], dtype=array.dtype),
-            chunks=max(1, len(array) // npartitions),
-        )
-
-    def __construct_graph(
-        self,
-        edge_info: Dict[Tuple[str, str, str], List[TensorType]],
-        multi_gpu: bool = False,
-        order: str = "CSC",
-    ) -> cugraph.MultiGraph:
-        """
-        This function takes edge information and uses it to construct
-        a cugraph Graph.  It determines the numerical edge type by
-        sorting the keys of the input dictionary
-        (the canonical edge types).
-
-        Parameters
-        ----------
-        edge_info: Dict[Tuple[str, str, str], List[TensorType]] (Required)
-            Input edge info dictionary, where keys are the canonical
-            edge type and values are the edge index (src/dst).
-
-        multi_gpu: bool (Optional, default=False)
-            Whether to construct a single-GPU or multi-GPU cugraph Graph.
-            Defaults to a single-GPU graph.
-
-        order: str (CSC or CSR)
-            Essentially whether to reverse edges so that the cuGraph
-            sampling algorithm operates on the CSC matrix instead of
-            the CSR matrix.  Should nearly always be CSC unless there
-            is a specific expectation of reverse sampling, or correctness
-            testing is being performed.
-
-        Returns
-        -------
-        A newly-constructed directed cugraph.MultiGraph object.
-        """
-
-        # Ensure the original dict is not modified.
-        edge_info_cg = {}
-
-        if order != "CSR" and order != "CSC":
-            raise ValueError("Order must be either CSC (default) or CSR!")
-
-        # Iterate over the keys in sorted order so that the created
-        # numerical types correspond to the lexicographic order
-        # of the keys, which is critical to converting the numeric
-        # keys back to canonical edge types later.
-        # FIXME don't always convert to host arrays (#3383)
-        for pyg_can_edge_type in sorted(edge_info.keys()):
-            src_type, _, dst_type = pyg_can_edge_type
-            srcs, dsts = edge_info[pyg_can_edge_type]
-
-            src_offset = np.searchsorted(self.__vertex_type_offsets["type"], src_type)
-            srcs_t = srcs + int(self.__vertex_type_offsets["start"][src_offset])
-            if isinstance(srcs_t, torch.Tensor):
-                srcs_t = srcs_t.cpu()
-            else:
-                if isinstance(srcs_t, dask_cudf.Series):
-                    srcs_t = srcs_t.compute()
-                if isinstance(srcs_t, cudf.Series):
-                    srcs_t = srcs_t.values_host
-
-            dst_offset = np.searchsorted(self.__vertex_type_offsets["type"], dst_type)
-            dsts_t = dsts + int(self.__vertex_type_offsets["start"][dst_offset])
-            if isinstance(dsts_t, torch.Tensor):
-                dsts_t = dsts_t.cpu()
-            else:
-                if isinstance(dsts_t, dask_cudf.Series):
-                    dsts_t = dsts_t.compute()
-                if isinstance(dsts_t, cudf.Series):
-                    dsts_t = dsts_t.values_host
-
-            edge_info_cg[pyg_can_edge_type] = (srcs_t, dsts_t)
-
-        na_src = np.concatenate(
-            [
-                edge_info_cg[pyg_can_edge_type][0]
-                for pyg_can_edge_type in sorted(edge_info_cg.keys())
-            ]
-        )
-
-        na_dst = np.concatenate(
-            [
-                edge_info_cg[pyg_can_edge_type][1]
-                for pyg_can_edge_type in sorted(edge_info_cg.keys())
-            ]
-        )
-
-        et_offsets = self.__edge_type_offsets
-        na_etp = np.concatenate(
-            [
-                np.full(
-                    int(et_offsets["stop"][i] - et_offsets["start"][i] + 1),
-                    i,
-                    dtype="int32",
-                )
-                for i in range(len(self.__edge_type_offsets["start"]))
-            ]
-        )
-
-        vertex_dtype = na_src.dtype
-
-        if multi_gpu:
-            client = distributed.get_client()
-            nworkers = len(client.scheduler_info()["workers"])
-            npartitions = nworkers * 4
-
-            src_dar = self.__dask_array_from_numpy(na_src, npartitions)
-            del na_src
-
-            dst_dar = self.__dask_array_from_numpy(na_dst, npartitions)
-            del na_dst
-
-            etp_dar = self.__dask_array_from_numpy(na_etp, npartitions)
-            del na_etp
-
-            df = dd.from_dask_array(etp_dar, columns=["etp"])
-            df["src"] = dst_dar if order == "CSC" else src_dar
-            df["dst"] = src_dar if order == "CSC" else dst_dar
-
-            del src_dar
-            del dst_dar
-            del etp_dar
-
-            if df.etp.dtype != "int32":
-                raise ValueError("Edge type must be int32!")
-
-            # Ensure the dataframe is constructed on each partition
-            # instead of adding additional synchronization head from potential
-            # host to device copies.
-            def get_empty_df():
-                return cudf.DataFrame(
-                    {
-                        "etp": cudf.Series([], dtype="int32"),
-                        "src": cudf.Series([], dtype=vertex_dtype),
-                        "dst": cudf.Series([], dtype=vertex_dtype),
-                    }
-                )
-
-            # Have to check for empty partitions and handle them appropriately
-            df = df.persist()
-            df = df.map_partitions(
-                lambda f: cudf.DataFrame.from_pandas(f)
-                if len(f) > 0
-                else get_empty_df(),
-                meta=get_empty_df(),
-            ).reset_index(
-                drop=True
-            )  # should be ok for dask
-        else:
-            df = pandas.DataFrame(
-                {
-                    "src": pandas.Series(na_dst)
-                    if order == "CSC"
-                    else pandas.Series(na_src),
-                    "dst": pandas.Series(na_src)
-                    if order == "CSC"
-                    else pandas.Series(na_dst),
-                    "etp": pandas.Series(na_etp),
-                }
-            )
-            df = cudf.from_pandas(df)
-            df.reset_index(drop=True, inplace=True)
-
-        graph = cugraph.MultiGraph(directed=True)
-        if multi_gpu:
-            graph.from_dask_cudf_edgelist(
-                df,
-                source="src",
-                destination="dst",
-                edge_type="etp",
-            )
-            distributed.get_client().publish_dataset(cugraph_graph=graph)
-        else:
-            graph.from_cudf_edgelist(
-                df,
-                source="src",
-                destination="dst",
-                edge_type="etp",
-            )
-
-        del df
-        return graph
-
-    @property
-    def _edge_types_to_attrs(self) -> dict:
-        return dict(self.__edge_types_to_attrs)
-
-    @property
-    def order(self) -> str:
-        return self.__order
-
-    @property
-    def node_types(self) -> List[NodeType]:
-        return list(self.__vertex_type_offsets["type"])
-
-    @property
-    def edge_types(self) -> List[EdgeType]:
-        return list(self.__edge_types_to_attrs.keys())
-
-    def canonical_edge_type_to_numeric(self, etype: EdgeType) -> int:
-        return np.searchsorted(self.__edge_type_offsets["type"], "__".join(etype))
-
-    def numeric_edge_type_to_canonical(self, etype: int) -> EdgeType:
-        return tuple(self.__edge_type_offsets["type"][etype].split("__"))
-
-    @cached_property
-    def _is_delayed(self):
-        if self.__graph is None:
-            return False
-        return self.__graph.is_multi_gpu()
-
-    def _numeric_vertex_type_from_name(self, vertex_type_name: str) -> int:
-        return np.searchsorted(self.__vertex_type_offsets["type"], vertex_type_name)
-
-    def get_vertex_index(self, vtypes) -> TensorType:
-        if isinstance(vtypes, str):
-            vtypes = [vtypes]
-
-        ix = torch.tensor([], dtype=torch.int64)
-
-        if isinstance(self.__vertex_type_offsets, dict):
-            vtypes = np.searchsorted(self.__vertex_type_offsets["type"], vtypes)
-        for vtype in vtypes:
-            start = int(self.__vertex_type_offsets["start"][vtype])
-            stop = int(self.__vertex_type_offsets["stop"][vtype])
-            ix = torch.concatenate(
-                [
-                    ix,
-                    torch.arange(
-                        start, stop + 1, 1, dtype=self.__vertex_dtype, device="cuda"
-                    ),
-                ]
-            )
-
-        return ix
-
-    def put_edge_index(self, edge_index, edge_attr):
-        """
-        Adds additional edges to the graph.
-        Not yet implemented.
-        """
-        raise NotImplementedError("Adding indices not supported.")
-
-    def get_all_edge_attrs(self):
-        """
-        Gets a list of all edge types and indices in this store.
-
-        Returns
-        -------
-        list[str]
-            All edge types and indices in this store.
-        """
-        return self.__edge_types_to_attrs.values()
-
-    def _get_edge_index(self, attr: CuGraphEdgeAttr) -> Tuple[TensorType, TensorType]:
-        """
-        Returns the edge index in the requested format
-        (as defined by attr).  Currently, only unsorted
-        COO is supported, which is returned as a (src,dst)
-        tuple as expected by the PyG API.
-
-        Parameters
-        ----------
-        attr: CuGraphEdgeAttr
-            The CuGraphEdgeAttr specifying the
-            desired edge type, layout (i.e. CSR, COO, CSC), and
-            whether the returned index should be sorted (if COO).
-            Currently, only unsorted COO is supported.
-
-        Returns
-        -------
-        (src, dst) : Tuple[tensor type]
-            Tuple of the requested edge index in COO form.
-            Currently, only COO form is supported.
-        """
-
-        if self.__graph is None:
-            raise ValueError("Graph is not in memory, cannot access edge index!")
-
-        if attr.layout != EdgeLayout.COO:
-            # TODO support returning CSR/CSC (Issue #3802)
-            raise TypeError("Only COO direct access is supported!")
-
-        # Currently, graph creation enforces that input vertex ids are always of
-        # integer type.  Therefore, it is currently safe to assume that for MG
-        # graphs, the src/dst col names are renumbered_src/dst
-        # and for SG graphs, the src/dst col names are src/dst.
-        # This may change in the future if/when renumbering or the graph
-        # creation process is refactored.
-        # See Issue #3201 for more details.
-        # Also note src/dst are flipped so that cuGraph sampling is done in
-        # CSC format rather than CSR format.
-        if self._is_delayed:
-            dst_col_name = self.__graph.renumber_map.renumbered_src_col_name
-            src_col_name = self.__graph.renumber_map.renumbered_dst_col_name
-        else:
-            dst_col_name = self.__graph.srcCol
-            src_col_name = self.__graph.dstCol
-
-        # If there is only one edge type (homogeneous graph) then
-        # bypass the edge filters for a significant speed improvement.
-        if len(self.__edge_types_to_attrs) == 1:
-            if attr.edge_type not in self.__edge_types_to_attrs:
-                raise ValueError(
-                    f"Requested edge type {attr.edge_type}" "is not present in graph."
-                )
-
-            df = self.__graph.edgelist.edgelist_df[[src_col_name, dst_col_name]]
-            src_offset = 0
-            dst_offset = 0
-        else:
-            src_type, _, dst_type = attr.edge_type
-            src_offset = int(
-                self.__vertex_type_offsets["start"][
-                    self._numeric_vertex_type_from_name(src_type)
-                ]
-            )
-            dst_offset = int(
-                self.__vertex_type_offsets["start"][
-                    self._numeric_vertex_type_from_name(dst_type)
-                ]
-            )
-            coli = np.searchsorted(
-                self.__edge_type_offsets["type"], "__".join(attr.edge_type)
-            )
-
-            df = self.__graph.edgelist.edgelist_df[
-                [src_col_name, dst_col_name, self.__graph.edgeTypeCol]
-            ]
-            df = df[df[self.__graph.edgeTypeCol] == coli]
-            df = df[[src_col_name, dst_col_name]]
-
-        if self._is_delayed:
-            df = df.compute()
-
-        src = torch.as_tensor(df[src_col_name], device="cuda") - src_offset
-        dst = torch.as_tensor(df[dst_col_name], device="cuda") - dst_offset
-
-        src = src.to(self.__vertex_dtype)
-        dst = dst.to(self.__vertex_dtype)
-
-        if src.shape[0] != dst.shape[0]:
-            raise IndexError("src and dst shape do not match!")
-
-        return (src, dst)
-
-    def get_edge_index(self, *args, **kwargs) -> Tuple[TensorType, TensorType]:
-        """
-        Synchronously gets an edge_index tensor from the materialized
-        graph.
-
-        Args:
-            **attr(EdgeAttr): the edge attributes.
-
-        Returns:
-            EdgeTensorType: an edge_index tensor corresonding to the provided
-            attributes, or None if there is no such tensor.
-
-        Raises:
-            KeyError: if the edge index corresponding to attr was not found.
-        """
-
-        edge_attr = self._edge_attr_cls.cast(*args, **kwargs)
-        edge_attr.layout = EdgeLayout(edge_attr.layout)
-        # Override is_sorted for CSC and CSR:
-        # TODO treat is_sorted specially in this function, where is_sorted=True
-        # returns an edge index sorted by column.
-        edge_attr.is_sorted = edge_attr.is_sorted or (
-            edge_attr.layout in [EdgeLayout.CSC, EdgeLayout.CSR]
-        )
-        edge_index = self._get_edge_index(edge_attr)
-        if edge_index is None:
-            raise KeyError(f"An edge corresponding to '{edge_attr}' was not " f"found")
-        return edge_index
-
-    def _subgraph(self, edge_types: List[tuple] = None) -> cugraph.MultiGraph:
-        """
-        Returns a subgraph with edges limited to those of a given type
-
-        Parameters
-        ----------
-        edge_types : list of pyg canonical edge types
-            Directly references the graph's internal edge types.  Does
-            not accept PyG edge type tuples.
-
-        Returns
-        -------
-        The appropriate extracted subgraph.  Will extract the subgraph
-        if it has not already been extracted.
-
-        """
-        if self.__graph is None:
-            raise ValueError("Graph is not in memory, cannot get subgraph")
-
-        if edge_types is not None and set(edge_types) != set(
-            self.__edge_types_to_attrs.keys()
-        ):
-            raise ValueError(
-                "Subgraphing is currently unsupported, please"
-                " specify all edge types in the graph or leave"
-                " this argument empty."
-            )
-
-        return self.__graph
-
-    def _get_vertex_groups_from_sample(
-        self, nodes_of_interest: TensorType, is_sorted: bool = False
-    ) -> Dict[str, torch.Tensor]:
-        """
-        Given a tensor of nodes of interest, this
-        method a single dictionary, noi_index.
-
-        noi_index is the original vertex ids grouped by vertex type.
-
-        Example Input: [5, 2, 1, 10, 11, 8]
-        Output: {'red_vertex': [5, 1, 8], 'blue_vertex': [2], 'green_vertex': [10, 11]}
-
-        """
-
-        noi_index = {}
-
-        vtypes = cudf.Series(self.__vertex_type_offsets["type"])
-        if len(vtypes) == 1:
-            noi_index[vtypes.iloc[0]] = nodes_of_interest
-        else:
-            noi_type_indices = torch.searchsorted(
-                torch.as_tensor(self.__vertex_type_offsets["stop"], device="cuda"),
-                nodes_of_interest,
-            )
-
-            noi_types = vtypes.iloc[cupy.asarray(noi_type_indices)].reset_index(
-                drop=True
-            )
-            noi_starts = self.__vertex_type_offsets["start"][noi_type_indices]
-
-            noi_types = cudf.Series(noi_types, name="t").groupby("t").groups
-
-            for type_name, ix in noi_types.items():
-                # store the renumbering for this vertex type
-                # renumbered vertex id is the index of the old id
-                ix = torch.as_tensor(ix, device="cuda")
-                # subtract off the offsets
-                noi_index[type_name] = nodes_of_interest[ix] - noi_starts[ix]
-
-        return noi_index
-
-    def _get_sample_from_vertex_groups(
-        self, vertex_groups: Dict[str, TensorType]
-    ) -> TensorType:
-        """
-        Inverse of _get_vertex_groups_from_sample() (although with de-offsetted ids).
-        Given a dictionary of node types and de-offsetted node ids, return
-        the global (non-renumbered) vertex ids.
-
-        Example Input: {'horse': [1, 3, 5], 'duck': [1, 2]}
-        Output: [1, 3, 5, 14, 15]
-        """
-        t = torch.tensor([], dtype=torch.int64, device="cuda")
-
-        for group_name, ix in vertex_groups.items():
-            type_id = self._numeric_vertex_type_from_name(group_name)
-            if not ix.is_cuda:
-                ix = ix.cuda()
-            offset = self.__vertex_type_offsets["start"][type_id]
-            u = ix + offset
-            t = torch.concatenate([t, u])
-
-        return t
-
-    def _get_renumbered_edge_groups_from_sample(
-        self, sampling_results: cudf.DataFrame, noi_index: dict
-    ) -> Tuple[
-        Dict[Tuple[str, str, str], torch.Tensor],
-        Tuple[Dict[Tuple[str, str, str], torch.Tensor]],
-    ]:
-        """
-        Given a cudf (NOT dask_cudf) DataFrame of sampling results and a dictionary
-        of non-renumbered vertex ids grouped by vertex type, this method
-        outputs two dictionaries:
-            1. row_dict
-            2. col_dict
-        (1) row_dict corresponds to the renumbered source vertex ids grouped
-            by PyG edge type - (src, type, dst) tuple.
-        (2) col_dict corresponds to the renumbered destination vertex ids grouped
-            by PyG edge type (src, type, dst) tuple.
-        * The two outputs combined make a PyG "edge index".
-        * The ith element of each array corresponds to the same edge.
-        * The _get_vertex_groups_from_sample() method is usually called
-          before this one to get the noi_index.
-
-        Example Input: Series({
-                'majors': [0, 5, 11, 3],
-                'minors': [8, 2, 3, 5]},
-                'edge_type': [1, 3, 5, 14]
-            }),
-            {
-                'blue_vertex': [0, 5],
-                'red_vertex': [3, 11],
-                'green_vertex': [2, 8]
-            }
-        Output: {
-                ('blue', 'etype1', 'green'): [0, 1],
-                ('red', 'etype2', 'red'): [1],
-                ('red', 'etype3', 'blue'): [0]
-            },
-            {
-                ('blue', 'etype1', 'green'): [1, 0],
-                ('red', 'etype2', 'red'): [0],
-                ('red', 'etype3', 'blue'): [1]
-            }
-
-        """
-        row_dict = {}
-        col_dict = {}
-        # If there is only 1 edge type (includes heterogeneous graphs)
-        if len(self.edge_types) == 1:
-            t_pyg_type = list(self.__edge_types_to_attrs.values())[0].edge_type
-            src_type, _, dst_type = t_pyg_type
-
-            # If there is only 1 node type (homogeneous)
-            # This should only occur if the cuGraph loader was
-            # not used.  This logic is deprecated.
-            if len(self.node_types) == 1:
-                warnings.warn(
-                    "Renumbering after sampling for homogeneous graphs is deprecated.",
-                    FutureWarning,
-                )
-
-                # Create a dataframe mapping old ids to new ids.
-                vtype = src_type
-                id_table = noi_index[vtype]
-                id_map = cudf.Series(
-                    cupy.arange(id_table.shape[0], dtype="int32"),
-                    name="new_id",
-                    index=cupy.asarray(id_table),
-                ).sort_index()
-
-                # Renumber the majors using binary search
-                # Step 1: get the index of the new id
-                ix_r = torch.searchsorted(
-                    torch.as_tensor(id_map.index.values, device="cuda"),
-                    torch.as_tensor(sampling_results.majors.values, device="cuda"),
-                )
-                # Step 2: Go from id indices to actual ids
-                row_dict[t_pyg_type] = torch.as_tensor(id_map.values, device="cuda")[
-                    ix_r
-                ]
-
-                # Renumber the minors using binary search
-                # Step 1: get the index of the new id
-                ix_c = torch.searchsorted(
-                    torch.as_tensor(id_map.index.values, device="cuda"),
-                    torch.as_tensor(sampling_results.minors.values, device="cuda"),
-                )
-                # Step 2: Go from id indices to actual ids
-                col_dict[t_pyg_type] = torch.as_tensor(id_map.values, device="cuda")[
-                    ix_c
-                ]
-            else:
-                # Handle the heterogeneous case where there is only 1 edge type
-                dst_id_table = noi_index[dst_type]
-                dst_id_map = cudf.DataFrame(
-                    {
-                        "dst": cupy.asarray(dst_id_table),
-                        "new_id": cupy.arange(dst_id_table.shape[0]),
-                    }
-                ).set_index("dst")
-                dst = dst_id_map["new_id"].loc[sampling_results.minors]
-                col_dict[t_pyg_type] = torch.as_tensor(dst.values, device="cuda")
-
-                src_id_table = noi_index[src_type]
-                src_id_map = cudf.DataFrame(
-                    {
-                        "src": cupy.asarray(src_id_table),
-                        "new_id": cupy.arange(src_id_table.shape[0]),
-                    }
-                ).set_index("src")
-                src = src_id_map["new_id"].loc[sampling_results.majors]
-                row_dict[t_pyg_type] = torch.as_tensor(src.values, device="cuda")
-
-        else:
-            # This will retrieve the single string representation.
-            # It needs to be converted to a tuple in the for loop below.
-            eoi_types = (
-                cudf.Series(self.__edge_type_offsets["type"])
-                .iloc[sampling_results.edge_type.astype("int32")]
-                .reset_index(drop=True)
-            )
-
-            eoi_types = cudf.Series(eoi_types, name="t").groupby("t").groups
-
-            for pyg_can_edge_type_str, ix in eoi_types.items():
-                pyg_can_edge_type = tuple(pyg_can_edge_type_str.split("__"))
-
-                if self.__order == "CSR":
-                    src_type, _, dst_type = pyg_can_edge_type
-                else:  # CSC
-                    dst_type, _, src_type = pyg_can_edge_type
-
-                # Get the de-offsetted minors
-                dst_num_type = self._numeric_vertex_type_from_name(dst_type)
-                minors = torch.as_tensor(
-                    sampling_results.minors.iloc[ix].values, device="cuda"
-                )
-                minors -= self.__vertex_type_offsets["start"][dst_num_type]
-
-                # Create the col entry for this type
-                dst_id_table = noi_index[dst_type]
-                dst_id_map = (
-                    cudf.Series(cupy.asarray(dst_id_table), name="dst")
-                    .reset_index()
-                    .rename(columns={"index": "new_id"})
-                    .set_index("dst")
-                )
-                dst = dst_id_map["new_id"].loc[cupy.asarray(minors)]
-                col_dict[pyg_can_edge_type] = torch.as_tensor(dst.values, device="cuda")
-
-                # Get the de-offsetted majors
-                src_num_type = self._numeric_vertex_type_from_name(src_type)
-                majors = torch.as_tensor(
-                    sampling_results.majors.iloc[ix].values, device="cuda"
-                )
-                majors -= self.__vertex_type_offsets["start"][src_num_type]
-
-                # Create the row entry for this type
-                src_id_table = noi_index[src_type]
-                src_id_map = (
-                    cudf.Series(cupy.asarray(src_id_table), name="src")
-                    .reset_index()
-                    .rename(columns={"index": "new_id"})
-                    .set_index("src")
-                )
-                src = src_id_map["new_id"].loc[cupy.asarray(majors)]
-                row_dict[pyg_can_edge_type] = torch.as_tensor(src.values, device="cuda")
-
-        return row_dict, col_dict
-
-    def put_tensor(self, tensor, attr) -> None:
-        raise NotImplementedError("Adding properties not supported.")
-
-    def create_named_tensor(
-        self, attr_name: str, properties: List[str], vertex_type: str, dtype: str
-    ) -> None:
-        """
-        Create a named tensor that contains a subset of
-        properties in the graph.
-
-        Parameters
-        ----------
-        attr_name : str
-            The name of the tensor within its group.
-        properties : list[str]
-            The properties the rows
-            of the tensor correspond to.
-        vertex_type : str
-            The vertex type associated with this new tensor property.
-        dtype : numpy/cupy dtype (i.e. 'int32') or torch dtype (i.e. torch.float)
-            The datatype of the tensor.  Usually float32/float64.
-        """
-        self._tensor_attr_dict[vertex_type].append(
-            CuGraphTensorAttr(
-                vertex_type, attr_name, properties=properties, dtype=dtype
-            )
-        )
-
-    def __infer_edge_types(
-        self,
-        num_nodes_dict: Dict[str, int],
-        num_edges_dict: Dict[Tuple[str, str, str], int],
-    ) -> None:
-        self.__edge_types_to_attrs = {}
-
-        for pyg_can_edge_type in sorted(num_edges_dict.keys()):
-            sz_src = num_nodes_dict[pyg_can_edge_type[0]]
-            sz_dst = num_nodes_dict[pyg_can_edge_type[-1]]
-            self.__edge_types_to_attrs[pyg_can_edge_type] = CuGraphEdgeAttr(
-                edge_type=pyg_can_edge_type,
-                layout=EdgeLayout.COO,
-                is_sorted=False,
-                size=(sz_src, sz_dst),
-            )
-
-    def __infer_existing_tensors(self, F) -> None:
-        """
-        Infers the tensor attributes/features.
-        """
-        for attr_name, types_with_attr in F.get_feature_list().items():
-            for vt in types_with_attr:
-                attr_dtype = F.get_data(np.array([0]), vt, attr_name).dtype
-                self.create_named_tensor(
-                    attr_name=attr_name,
-                    properties=None,
-                    vertex_type=vt,
-                    dtype=attr_dtype,
-                )
-
-    def get_all_tensor_attrs(self) -> List[CuGraphTensorAttr]:
-        """
-        Obtains all tensor attributes stored in this feature store.
-        """
-        # unpack and return the list of lists
-        it = chain.from_iterable(self._tensor_attr_dict.values())
-        return [CuGraphTensorAttr.cast(c) for c in it]
-
-    def _get_tensor(self, attr: CuGraphTensorAttr) -> TensorType:
-        feature_backend = self.__features.backend
-        cols = attr.properties
-
-        idx = attr.index
-        if idx is not None:
-            if feature_backend in ["torch", "wholegraph"]:
-                if not isinstance(idx, torch.Tensor):
-                    raise TypeError(
-                        f"Type {type(idx)} invalid"
-                        f" for feature store backend {feature_backend}"
-                    )
-            elif feature_backend == "numpy":
-                # allow feature indexing through cupy arrays
-                if isinstance(idx, cupy.ndarray):
-                    idx = idx.get()
-                elif isinstance(idx, torch.Tensor):
-                    idx = np.asarray(idx.cpu())
-
-        if cols is None:
-            t = self.__features.get_data(idx, attr.group_name, attr.attr_name)
-            if idx is None:
-                t = t[-1]
-
-            if isinstance(t, np.ndarray):
-                t = torch.as_tensor(t, device="cpu")
-
-            return t
-
-        else:
-            t = self.__features.get_data(idx, attr.group_name, cols[0])
-
-            if len(t.shape) == 1:
-                t = torch.tensor([t])
-
-            for col in cols[1:]:
-                u = self.__features.get_data(idx, attr.group_name, col)
-
-                if len(u.shape) == 1:
-                    u = torch.tensor([u])
-
-                t = torch.concatenate([t, u])
-
-            return t
-
-    def _multi_get_tensor(self, attrs: List[CuGraphTensorAttr]) -> List[TensorType]:
-        return [self._get_tensor(attr) for attr in attrs]
-
-    def multi_get_tensor(self, attrs: List[CuGraphTensorAttr]) -> List[TensorType]:
-        """
-        Synchronously obtains a :class:`FeatureTensorType` object from the
-        feature store for each tensor associated with the attributes in
-        `attrs`.
-
-        Parameters
-        ----------
-        attrs (List[TensorAttr]): a list of :class:`TensorAttr` attributes
-        that identify the tensors to get.
-
-        Returns
-        -------
-        List[FeatureTensorType]: a Tensor of the same type as the index for
-        each attribute.
-
-        Raises
-        ------
-            KeyError: if a tensor corresponding to an attr was not found.
-            ValueError: if any input `TensorAttr` is not fully specified.
-        """
-        attrs = [
-            self._infer_unspecified_attr(self._tensor_attr_cls.cast(attr))
-            for attr in attrs
-        ]
-        bad_attrs = [attr for attr in attrs if not attr.is_fully_specified()]
-        if len(bad_attrs) > 0:
-            raise ValueError(
-                f"The input TensorAttr(s) '{bad_attrs}' are not fully "
-                f"specified. Please fully specify them by specifying all "
-                f"'UNSET' fields"
-            )
-
-        tensors = self._multi_get_tensor(attrs)
-
-        bad_attrs = [attrs[i] for i, v in enumerate(tensors) if v is None]
-        if len(bad_attrs) > 0:
-            raise KeyError(
-                f"Tensors corresponding to attributes " f"'{bad_attrs}' were not found"
-            )
-
-        return [tensor for attr, tensor in zip(attrs, tensors)]
-
-    def get_tensor(self, *args, **kwargs) -> TensorType:
-        """
-        Synchronously obtains a :class:`FeatureTensorType` object from the
-        feature store. Feature store implementors guarantee that the call
-        :obj:`get_tensor(put_tensor(tensor, attr), attr) = tensor` holds.
-
-        Parameters
-        ----------
-        **attr (TensorAttr): Any relevant tensor attributes that correspond
-            to the feature tensor. See the :class:`TensorAttr`
-            documentation for required and optional attributes. It is the
-            job of implementations of a :class:`FeatureStore` to store this
-            metadata in a meaningful way that allows for tensor retrieval
-            from a :class:`TensorAttr` object.
-
-        Returns
-        -------
-        FeatureTensorType: a Tensor of the same type as the index.
-
-        Raises
-        ------
-        KeyError: if the tensor corresponding to attr was not found.
-        ValueError: if the input `TensorAttr` is not fully specified.
-        """
-
-        attr = self._tensor_attr_cls.cast(*args, **kwargs)
-        attr = self._infer_unspecified_attr(attr)
-
-        if not attr.is_fully_specified():
-            raise ValueError(
-                f"The input TensorAttr '{attr}' is not fully "
-                f"specified. Please fully specify the input by "
-                f"specifying all 'UNSET' fields."
-            )
-
-        tensor = self._get_tensor(attr)
-        if tensor is None:
-            raise KeyError(f"A tensor corresponding to '{attr}' was not found")
-        return tensor
-
-    def _get_tensor_size(self, attr: CuGraphTensorAttr) -> Union[List, int]:
-        return self._get_tensor(attr).size()
-
-    def get_tensor_size(self, *args, **kwargs) -> Union[List, int]:
-        """
-        Obtains the size of a tensor given its attributes, or :obj:`None`
-        if the tensor does not exist.
-        """
-        attr = self._tensor_attr_cls.cast(*args, **kwargs)
-        if not attr.is_set("index"):
-            attr.index = None
-        return self._get_tensor_size(attr)
-
-    def _remove_tensor(self, attr):
-        raise NotImplementedError("Removing features not supported")
-
-    def _infer_unspecified_attr(self, attr: CuGraphTensorAttr) -> CuGraphTensorAttr:
-        if attr.properties == _field_status.UNSET:
-            # attempt to infer property names
-            if attr.group_name in self._tensor_attr_dict:
-                for n in self._tensor_attr_dict[attr.group_name]:
-                    if attr.attr_name == n.attr_name:
-                        attr.properties = n.properties
-            else:
-                raise KeyError(f"Invalid group name {attr.group_name}")
-
-        if attr.dtype == _field_status.UNSET:
-            # attempt to infer dtype
-            if attr.group_name in self._tensor_attr_dict:
-                for n in self._tensor_attr_dict[attr.group_name]:
-                    if attr.attr_name == n.attr_name:
-                        attr.dtype = n.dtype
-
-        return attr
-
-    def filter(
-        self,
-        format: str,
-        node_dict: Dict[str, torch.Tensor],
-        row_dict: Dict[str, torch.Tensor],
-        col_dict: Dict[str, torch.Tensor],
-        edge_dict: Dict[str, Tuple[torch.Tensor]],
-    ) -> torch_geometric.data.HeteroData:
-        """
-        Parameters
-        ----------
-        format: str
-            COO or CSC
-        node_dict: Dict[str, torch.Tensor]
-            IDs of nodes in original store being outputted
-        row_dict: Dict[str, torch.Tensor]
-            Renumbered output edge index row
-        col_dict: Dict[str, torch.Tensor]
-            Renumbered output edge index column
-        edge_dict: Dict[str, Tuple[torch.Tensor]]
-            Currently unused original edge mapping
-        """
-        data = torch_geometric.data.HeteroData()
-
-        # TODO use torch_geometric.EdgeIndex in release 24.04 (Issue #4051)
-        for attr in self.get_all_edge_attrs():
-            key = attr.edge_type
-            if key in row_dict and key in col_dict:
-                if format == "CSC":
-                    data.put_edge_index(
-                        (row_dict[key], col_dict[key]),
-                        edge_type=key,
-                        layout="csc",
-                        is_sorted=True,
-                    )
-                else:
-                    data[key].edge_index = torch.stack(
-                        [
-                            row_dict[key],
-                            col_dict[key],
-                        ],
-                        dim=0,
-                    )
-
-        required_attrs = []
-        # To prevent copying multiple times, we use a cache;
-        # the original node_dict serves as the gpu cache if needed
-        node_dict_cpu = {}
-        for attr in self.get_all_tensor_attrs():
-            if attr.group_name in node_dict:
-                device = self.__features.get_storage(attr.group_name, attr.attr_name)
-                attr.index = node_dict[attr.group_name]
-                if not isinstance(attr.index, torch.Tensor):
-                    raise ValueError("Node index must be a tensor!")
-                if attr.index.is_cuda and device == "cpu":
-                    if attr.group_name not in node_dict_cpu:
-                        node_dict_cpu[attr.group_name] = attr.index.cpu()
-                    attr.index = node_dict_cpu[attr.group_name]
-                elif attr.index.is_cpu and device == "cuda":
-                    node_dict_cpu[attr.group_name] = attr.index
-                    node_dict[attr.group_name] = attr.index.cuda()
-                    attr.index = node_dict[attr.group_name]
-
-                required_attrs.append(attr)
-                data[attr.group_name].num_nodes = attr.index.size(0)
-
-        tensors = self.multi_get_tensor(required_attrs)
-        for i, attr in enumerate(required_attrs):
-            data[attr.group_name][attr.attr_name] = tensors[i]
-
-        return data
-
-    def __len__(self):
-        return len(self.get_all_tensor_attrs())
diff --git a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py
deleted file mode 100644
index b6450e7b192..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-from typing import Optional, Tuple, List
-
-from cugraph.utilities.utils import import_optional, MissingModule
-
-torch = import_optional("torch")
-torch_geometric = import_optional("torch_geometric")
-tensordict = import_optional("tensordict")
-wgth = import_optional("pylibwholegraph.torch")
-
-
-class TensorDictFeatureStore(
-    object
-    if isinstance(torch_geometric, MissingModule)
-    else torch_geometric.data.FeatureStore
-):
-    """
-    A basic implementation of the PyG FeatureStore interface that stores
-    feature data in a single TensorDict.  This type of feature store is
-    not distributed, so each node will have to load the entire graph's
-    features into memory.
-    """
-
-    def __init__(self):
-        """
-        Constructs an empty TensorDictFeatureStore.
-        """
-        super().__init__()
-
-        self.__features = {}
-
-    def _put_tensor(
-        self,
-        tensor: "torch_geometric.typing.FeatureTensorType",
-        attr: "torch_geometric.data.feature_store.TensorAttr",
-    ) -> bool:
-        if attr.group_name in self.__features:
-            td = self.__features[attr.group_name]
-            batch_size = td.batch_size[0]
-
-            if attr.is_set("index"):
-                if attr.attr_name in td.keys():
-                    if attr.index.shape[0] != batch_size:
-                        raise ValueError(
-                            "Leading size of index tensor "
-                            "does not match existing tensors for group name "
-                            f"{attr.group_name}; Expected {batch_size}, "
-                            f"got {attr.index.shape[0]}"
-                        )
-                    td[attr.attr_name][attr.index] = tensor
-                    return True
-                else:
-                    warnings.warn(
-                        "Ignoring index parameter "
-                        f"(attribute does not exist for group {attr.group_name})"
-                    )
-
-            if tensor.shape[0] != batch_size:
-                raise ValueError(
-                    "Leading size of input tensor does not match "
-                    f"existing tensors for group name {attr.group_name};"
-                    f" Expected {batch_size}, got {tensor.shape[0]}"
-                )
-        else:
-            batch_size = tensor.shape[0]
-            self.__features[attr.group_name] = tensordict.TensorDict(
-                {}, batch_size=batch_size
-            )
-
-        self.__features[attr.group_name][attr.attr_name] = tensor
-        return True
-
-    def _get_tensor(
-        self, attr: "torch_geometric.data.feature_store.TensorAttr"
-    ) -> Optional["torch_geometric.typing.FeatureTensorType"]:
-        if attr.group_name not in self.__features:
-            return None
-
-        if attr.attr_name not in self.__features[attr.group_name].keys():
-            return None
-
-        tensor = self.__features[attr.group_name][attr.attr_name]
-        return (
-            tensor
-            if (attr.index is None or (not attr.is_set("index")))
-            else tensor[attr.index]
-        )
-
-    def _remove_tensor(
-        self, attr: "torch_geometric.data.feature_store.TensorAttr"
-    ) -> bool:
-        if attr.group_name not in self.__features:
-            return False
-
-        if attr.attr_name not in self.__features[attr.group_name].keys():
-            return False
-
-        del self.__features[attr.group_name][attr.attr_name]
-        return True
-
-    def _get_tensor_size(
-        self, attr: "torch_geometric.data.feature_store.TensorAttr"
-    ) -> Tuple:
-        return self._get_tensor(attr).size()
-
-    def get_all_tensor_attrs(
-        self,
-    ) -> List["torch_geometric.data.feature_store.TensorAttr"]:
-        attrs = []
-        for group_name, td in self.__features.items():
-            for attr_name in td.keys():
-                attrs.append(
-                    torch_geometric.data.feature_store.TensorAttr(
-                        group_name,
-                        attr_name,
-                    )
-                )
-
-        return attrs
-
-
-class WholeFeatureStore(
-    object
-    if isinstance(torch_geometric, MissingModule)
-    else torch_geometric.data.FeatureStore
-):
-    """
-    A basic implementation of the PyG FeatureStore interface that stores
-    feature data in WholeGraph WholeMemory.  This type of feature store is
-    distributed, and avoids data replication across workers.
-
-    Data should be sliced before being passed into this feature store.
-    That means each worker should have its own partition and put_tensor
-    should be called for each worker's local partition.  When calling
-    get_tensor, multi_get_tensor, etc., the entire tensor can be accessed
-    regardless of what worker's partition the desired slice of the tensor
-    is on.
-    """
-
-    def __init__(self, memory_type="distributed", location="cpu"):
-        """
-        Constructs an empty WholeFeatureStore.
-
-        Parameters
-        ----------
-        memory_type: str (optional, default='distributed')
-            The memory type of this store.  Options are
-            'distributed', 'chunked', and 'continuous'.
-            For more information consult the WholeGraph
-            documentation.
-        location: str(optional, default='cpu')
-            The location ('cpu' or 'cuda') where data is stored.
-        """
-        super().__init__()
-
-        self.__features = {}
-
-        self.__wg_comm = wgth.get_global_communicator()
-        self.__wg_type = memory_type
-        self.__wg_location = location
-
-    def _put_tensor(
-        self,
-        tensor: "torch_geometric.typing.FeatureTensorType",
-        attr: "torch_geometric.data.feature_store.TensorAttr",
-    ) -> bool:
-        wg_comm_obj = self.__wg_comm
-
-        if attr.is_set("index"):
-            if (attr.group_name, attr.attr_name) in self.__features:
-                raise NotImplementedError(
-                    "Updating an embedding from an index"
-                    " is not supported by WholeGraph."
-                )
-            else:
-                warnings.warn(
-                    "Ignoring index parameter "
-                    f"(attribute does not exist for group {attr.group_name})"
-                )
-
-        if len(tensor.shape) > 2:
-            raise ValueError("Only 1-D or 2-D tensors are supported by WholeGraph.")
-
-        rank = torch.distributed.get_rank()
-        world_size = torch.distributed.get_world_size()
-
-        ld = torch.tensor(tensor.shape[0], device="cuda", dtype=torch.int64)
-        sizes = torch.empty((world_size,), device="cuda", dtype=torch.int64)
-        torch.distributed.all_gather_into_tensor(sizes, ld)
-
-        sizes = sizes.cpu()
-        ld = sizes.sum()
-
-        td = -1 if len(tensor.shape) == 1 else tensor.shape[1]
-        global_shape = [
-            int(ld),
-            td if td > 0 else 1,
-        ]
-
-        if td < 0:
-            tensor = tensor.reshape((tensor.shape[0], 1))
-
-        wg_embedding = wgth.create_wholememory_tensor(
-            wg_comm_obj,
-            self.__wg_type,
-            self.__wg_location,
-            global_shape,
-            tensor.dtype,
-            [global_shape[1], 1],
-        )
-
-        offset = sizes[:rank].sum() if rank > 0 else 0
-
-        wg_embedding.scatter(
-            tensor.clone(memory_format=torch.contiguous_format).cuda(),
-            torch.arange(
-                offset, offset + tensor.shape[0], dtype=torch.int64, device="cuda"
-            ).contiguous(),
-        )
-
-        wg_comm_obj.barrier()
-
-        self.__features[attr.group_name, attr.attr_name] = (wg_embedding, td)
-        return True
-
-    def _get_tensor(
-        self, attr: "torch_geometric.data.feature_store.TensorAttr"
-    ) -> Optional["torch_geometric.typing.FeatureTensorType"]:
-        if (attr.group_name, attr.attr_name) not in self.__features:
-            return None
-
-        emb, td = self.__features[attr.group_name, attr.attr_name]
-
-        if attr.index is None or (not attr.is_set("index")):
-            attr.index = torch.arange(emb.shape[0], dtype=torch.int64)
-
-        attr.index = attr.index.cuda()
-        t = emb.gather(
-            attr.index,
-            force_dtype=emb.dtype,
-        )
-
-        if td < 0:
-            t = t.reshape((t.shape[0],))
-
-        return t
-
-    def _remove_tensor(
-        self, attr: "torch_geometric.data.feature_store.TensorAttr"
-    ) -> bool:
-        if (attr.group_name, attr.attr_name) not in self.__features:
-            return False
-
-        del self.__features[attr.group_name, attr.attr_name]
-        return True
-
-    def _get_tensor_size(
-        self, attr: "torch_geometric.data.feature_store.TensorAttr"
-    ) -> Tuple:
-        return self.__features[attr.group_name, attr.attr_name].shape
-
-    def get_all_tensor_attrs(
-        self,
-    ) -> List["torch_geometric.data.feature_store.TensorAttr"]:
-        attrs = []
-        for (group_name, attr_name) in self.__features.keys():
-            attrs.append(
-                torch_geometric.data.feature_store.TensorAttr(
-                    group_name=group_name,
-                    attr_name=attr_name,
-                )
-            )
-
-        return attrs
diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py
deleted file mode 100644
index c47dda5eaa5..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py
+++ /dev/null
@@ -1,365 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import cupy
-import cudf
-import pandas
-
-import pylibcugraph
-
-from cugraph.utilities.utils import import_optional, MissingModule
-from cugraph.gnn.comms import cugraph_comms_get_raft_handle
-
-from typing import Union, Optional, List, Dict, Tuple
-
-
-# Have to use import_optional even though these are required
-# dependencies in order to build properly.
-torch_geometric = import_optional("torch_geometric")
-torch = import_optional("torch")
-tensordict = import_optional("tensordict")
-
-TensorType = Union["torch.Tensor", cupy.ndarray, np.ndarray, cudf.Series, pandas.Series]
-
-
-class GraphStore(
-    object
-    if isinstance(torch_geometric, MissingModule)
-    else torch_geometric.data.GraphStore
-):
-    """
-    cuGraph-backed PyG GraphStore implementation that distributes
-    the graph across workers.  This object uses lazy graph creation.
-    Users can repeatedly call put_edge_index, and the tensors won't
-    be converted into a cuGraph graph until one is needed
-    (i.e. when creating a loader). Supports
-    single-node/single-GPU, single-node/multi-GPU, and
-    multi-node/multi-GPU graph storage.
-
-    Each worker should have a slice of the graph locally, and
-    call put_edge_index with its slice.
-    """
-
-    def __init__(self, is_multi_gpu: bool = False):
-        """
-        Constructs a new, empty GraphStore object.  This object
-        represents one slice of a graph on particular worker.
-        """
-        self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,))
-        self.__sizes = {}
-
-        self.__handle = None
-        self.__is_multi_gpu = is_multi_gpu
-
-        self.__clear_graph()
-
-        super().__init__()
-
-    def __clear_graph(self):
-        self.__graph = None
-        self.__vertex_offsets = None
-        self.__weight_attr = None
-
-    def _put_edge_index(
-        self,
-        edge_index: "torch_geometric.typing.EdgeTensorType",
-        edge_attr: "torch_geometric.data.EdgeAttr",
-    ) -> bool:
-        if edge_attr.layout != torch_geometric.data.graph_store.EdgeLayout.COO:
-            raise ValueError("Only COO format supported")
-
-        if isinstance(edge_index, (cupy.ndarray, cudf.Series)):
-            edge_index = torch.as_tensor(edge_index, device="cuda")
-        elif isinstance(edge_index, (np.ndarray)):
-            edge_index = torch.as_tensor(edge_index, device="cpu")
-        elif isinstance(edge_index, pandas.Series):
-            edge_index = torch.as_tensor(edge_index.values, device="cpu")
-        elif isinstance(edge_index, cudf.Series):
-            edge_index = torch.as_tensor(edge_index.values, device="cuda")
-
-        self.__edge_indices[edge_attr.edge_type] = torch.stack(
-            [edge_index[0], edge_index[1]]
-        )
-        self.__sizes[edge_attr.edge_type] = edge_attr.size
-
-        # invalidate the graph
-        self.__clear_graph()
-        return True
-
-    def _get_edge_index(
-        self, edge_attr: "torch_geometric.data.EdgeAttr"
-    ) -> Optional["torch_geometric.typing.EdgeTensorType"]:
-        ei = torch_geometric.EdgeIndex(self.__edge_indices[edge_attr.edge_type])
-
-        if edge_attr.layout == "csr":
-            return ei.sort_by("row").values.get_csr()
-        elif edge_attr.layout == "csc":
-            return ei.sort_by("col").values.get_csc()
-
-        return ei
-
-    def _remove_edge_index(self, edge_attr: "torch_geometric.data.EdgeAttr") -> bool:
-        del self.__edge_indices[edge_attr.edge_type]
-
-        # invalidate the graph
-        self.__clear_graph()
-        return True
-
-    def get_all_edge_attrs(self) -> List["torch_geometric.data.EdgeAttr"]:
-        attrs = []
-        for et in self.__edge_indices.keys(leaves_only=True, include_nested=True):
-            attrs.append(
-                torch_geometric.data.EdgeAttr(
-                    edge_type=et, layout="coo", is_sorted=False, size=self.__sizes[et]
-                )
-            )
-
-        return attrs
-
-    @property
-    def is_multi_gpu(self):
-        return self.__is_multi_gpu
-
-    @property
-    def _resource_handle(self):
-        if self.__handle is None:
-            if self.is_multi_gpu:
-                self.__handle = pylibcugraph.ResourceHandle(
-                    cugraph_comms_get_raft_handle().getHandle()
-                )
-            else:
-                self.__handle = pylibcugraph.ResourceHandle()
-        return self.__handle
-
-    @property
-    def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]:
-        graph_properties = pylibcugraph.GraphProperties(
-            is_multigraph=True, is_symmetric=False
-        )
-
-        if self.__graph is None:
-            edgelist_dict = self.__get_edgelist()
-
-            if self.is_multi_gpu:
-                rank = torch.distributed.get_rank()
-                world_size = torch.distributed.get_world_size()
-
-                vertices_array = cupy.arange(
-                    sum(self._num_vertices().values()), dtype="int64"
-                )
-                vertices_array = cupy.array_split(vertices_array, world_size)[rank]
-
-                self.__graph = pylibcugraph.MGGraph(
-                    self._resource_handle,
-                    graph_properties,
-                    [cupy.asarray(edgelist_dict["src"]).astype("int64")],
-                    [cupy.asarray(edgelist_dict["dst"]).astype("int64")],
-                    vertices_array=[vertices_array],
-                    edge_id_array=[cupy.asarray(edgelist_dict["eid"])],
-                    edge_type_array=[cupy.asarray(edgelist_dict["etp"])],
-                    weight_array=[cupy.asarray(edgelist_dict["wgt"])]
-                    if "wgt" in edgelist_dict
-                    else None,
-                )
-            else:
-                self.__graph = pylibcugraph.SGGraph(
-                    self._resource_handle,
-                    graph_properties,
-                    cupy.asarray(edgelist_dict["src"]).astype("int64"),
-                    cupy.asarray(edgelist_dict["dst"]).astype("int64"),
-                    vertices_array=cupy.arange(
-                        sum(self._num_vertices().values()), dtype="int64"
-                    ),
-                    edge_id_array=cupy.asarray(edgelist_dict["eid"]),
-                    edge_type_array=cupy.asarray(edgelist_dict["etp"]),
-                    weight_array=cupy.asarray(edgelist_dict["wgt"])
-                    if "wgt" in edgelist_dict
-                    else None,
-                )
-
-        return self.__graph
-
-    def _num_vertices(self) -> Dict[str, int]:
-        num_vertices = {}
-        for edge_attr in self.get_all_edge_attrs():
-            if edge_attr.size is not None:
-                num_vertices[edge_attr.edge_type[0]] = (
-                    max(num_vertices[edge_attr.edge_type[0]], edge_attr.size[0])
-                    if edge_attr.edge_type[0] in num_vertices
-                    else edge_attr.size[0]
-                )
-                num_vertices[edge_attr.edge_type[2]] = (
-                    max(num_vertices[edge_attr.edge_type[2]], edge_attr.size[1])
-                    if edge_attr.edge_type[2] in num_vertices
-                    else edge_attr.size[1]
-                )
-            else:
-                if edge_attr.edge_type[0] != edge_attr.edge_type[2]:
-                    if edge_attr.edge_type[0] not in num_vertices:
-                        num_vertices[edge_attr.edge_type[0]] = int(
-                            self.__edge_indices[edge_attr.edge_type][0].max() + 1
-                        )
-                    if edge_attr.edge_type[2] not in num_vertices:
-                        num_vertices[edge_attr.edge_type[1]] = int(
-                            self.__edge_indices[edge_attr.edge_type][1].max() + 1
-                        )
-                elif edge_attr.edge_type[0] not in num_vertices:
-                    num_vertices[edge_attr.edge_type[0]] = int(
-                        self.__edge_indices[edge_attr.edge_type].max() + 1
-                    )
-
-        if self.is_multi_gpu:
-            vtypes = num_vertices.keys()
-            for vtype in vtypes:
-                sz = torch.tensor(num_vertices[vtype], device="cuda")
-                torch.distributed.all_reduce(sz, op=torch.distributed.ReduceOp.MAX)
-                num_vertices[vtype] = int(sz)
-        return num_vertices
-
-    @property
-    def _vertex_offsets(self) -> Dict[str, int]:
-        if self.__vertex_offsets is None:
-            num_vertices = self._num_vertices()
-            ordered_keys = sorted(list(num_vertices.keys()))
-            self.__vertex_offsets = {}
-            offset = 0
-            for vtype in ordered_keys:
-                self.__vertex_offsets[vtype] = offset
-                offset += num_vertices[vtype]
-
-        return dict(self.__vertex_offsets)
-
-    @property
-    def is_homogeneous(self) -> bool:
-        return len(self._vertex_offsets) == 1
-
-    def _set_weight_attr(self, attr: Tuple["torch_geometric.data.FeatureStore", str]):
-        if attr != self.__weight_attr:
-            self.__clear_graph()
-            self.__weight_attr = attr
-
-    def __get_weight_tensor(
-        self,
-        sorted_keys: List[Tuple[str, str, str]],
-        start_offsets: "torch.Tensor",
-        num_edges_t: "torch.Tensor",
-    ):
-        feature_store, attr_name = self.__weight_attr
-
-        weights = []
-        for i, et in enumerate(sorted_keys):
-            ix = torch.arange(
-                start_offsets[i],
-                start_offsets[i] + num_edges_t[i],
-                dtype=torch.int64,
-                device="cpu",
-            )
-
-            weights.append(feature_store[et, attr_name][ix])
-
-        return torch.concat(weights)
-
-    def __get_edgelist(self):
-        """
-        Returns
-        -------
-        Dict[str, torch.Tensor] with the following keys:
-            src: source vertices (int64)
-                Note that src is the 2nd element of the PyG edge index.
-            dst: destination vertices (int64)
-                Note that dst is the 1st element of the PyG edge index.
-            eid: edge ids for each edge (int64)
-                Note that these start from 0 for each edge type.
-            etp: edge types for each edge (int32)
-                Note that these are in lexicographic order.
-        """
-        sorted_keys = sorted(
-            list(self.__edge_indices.keys(leaves_only=True, include_nested=True))
-        )
-
-        # note that this still follows the PyG convention of (dst, rel, src)
-        # i.e. (author, writes, paper): [[0,1,2],[2,0,1]] is referring to a
-        # cuGraph graph where (paper 2) -> (author 0), (paper 0) -> (author 1),
-        # and (paper 1) -> (author 0)
-        edge_index = torch.concat(
-            [
-                torch.stack(
-                    [
-                        self.__edge_indices[dst_type, rel_type, src_type][0]
-                        + self._vertex_offsets[dst_type],
-                        self.__edge_indices[dst_type, rel_type, src_type][1]
-                        + self._vertex_offsets[src_type],
-                    ]
-                )
-                for (dst_type, rel_type, src_type) in sorted_keys
-            ],
-            axis=1,
-        ).cuda()
-
-        edge_type_array = torch.arange(
-            len(sorted_keys), dtype=torch.int32, device="cuda"
-        ).repeat_interleave(
-            torch.tensor(
-                [self.__edge_indices[et].shape[1] for et in sorted_keys],
-                device="cuda",
-                dtype=torch.int64,
-            )
-        )
-
-        num_edges_t = torch.tensor(
-            [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda"
-        )
-
-        if self.is_multi_gpu:
-            rank = torch.distributed.get_rank()
-            world_size = torch.distributed.get_world_size()
-
-            num_edges_all_t = torch.empty(
-                world_size, num_edges_t.numel(), dtype=torch.int64, device="cuda"
-            )
-            torch.distributed.all_gather_into_tensor(num_edges_all_t, num_edges_t)
-
-            start_offsets = num_edges_all_t[:rank].T.sum(axis=1)
-        else:
-            rank = 0
-            start_offsets = torch.zeros(
-                (len(sorted_keys),), dtype=torch.int64, device="cuda"
-            )
-            num_edges_all_t = num_edges_t.reshape((1, num_edges_t.numel()))
-
-        edge_id_array = torch.concat(
-            [
-                torch.arange(
-                    start_offsets[i],
-                    start_offsets[i] + num_edges_all_t[rank][i],
-                    dtype=torch.int64,
-                    device="cuda",
-                )
-                for i in range(len(sorted_keys))
-            ]
-        )
-
-        d = {
-            "dst": edge_index[0],
-            "src": edge_index[1],
-            "etp": edge_type_array,
-            "eid": edge_id_array,
-        }
-
-        if self.__weight_attr is not None:
-            d["wgt"] = self.__get_weight_tensor(
-                sorted_keys, start_offsets.cpu(), num_edges_t.cpu()
-            ).cuda()
-
-        return d
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py
deleted file mode 100644
index 31cbaf69ca5..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This example shows how to use cuGraph nccl-only comms, pylibcuGraph,
-# and PyTorch DDP to run a multi-GPU sampling workflow.  Most users of the
-# GNN packages will not interact with cuGraph directly.  This example
-# is intented for users who want to extend cuGraph within a DDP workflow.
-
-import os
-import re
-import tempfile
-
-import numpy as np
-import torch
-import torch.multiprocessing as tmp
-import torch.distributed as dist
-
-import cudf
-
-from cugraph.gnn import (
-    cugraph_comms_init,
-    cugraph_comms_shutdown,
-    cugraph_comms_create_unique_id,
-    cugraph_comms_get_raft_handle,
-    DistSampleWriter,
-    UniformNeighborSampler,
-)
-
-from pylibcugraph import MGGraph, ResourceHandle, GraphProperties
-
-from ogb.nodeproppred import NodePropPredDataset
-
-
-def init_pytorch(rank, world_size):
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "12355"
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
-
-
-def sample(rank: int, world_size: int, uid, edgelist, directory):
-    init_pytorch(rank, world_size)
-
-    device = rank
-    cugraph_comms_init(rank, world_size, uid, device)
-
-    print(f"rank {rank} initialized cugraph")
-
-    src = cudf.Series(np.array_split(edgelist[0], world_size)[rank])
-    dst = cudf.Series(np.array_split(edgelist[1], world_size)[rank])
-
-    seeds_per_rank = 50
-    seeds = cudf.Series(np.arange(rank * seeds_per_rank, (rank + 1) * seeds_per_rank))
-    handle = ResourceHandle(cugraph_comms_get_raft_handle().getHandle())
-
-    print("constructing graph")
-    G = MGGraph(
-        handle,
-        GraphProperties(is_multigraph=True, is_symmetric=False),
-        [src],
-        [dst],
-    )
-    print("graph constructed")
-
-    sample_writer = DistSampleWriter(directory=directory, batches_per_partition=2)
-    sampler = UniformNeighborSampler(
-        G,
-        sample_writer,
-        fanout=[5, 5],
-    )
-
-    sampler.sample_from_nodes(seeds, batch_size=16, random_state=62)
-
-    dist.barrier()
-    cugraph_comms_shutdown()
-    print(f"rank {rank} shut down cugraph")
-
-
-def main():
-    world_size = torch.cuda.device_count()
-    uid = cugraph_comms_create_unique_id()
-
-    dataset = NodePropPredDataset("ogbn-products")
-    el = dataset[0][0]["edge_index"].astype("int64")
-
-    with tempfile.TemporaryDirectory() as directory:
-        tmp.spawn(
-            sample,
-            args=(world_size, uid, el, directory),
-            nprocs=world_size,
-        )
-
-        print("Printing samples...")
-        for file in os.listdir(directory):
-            m = re.match(r"batch=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet", file)
-            rank, start, _, end = int(m[1]), int(m[2]), int(m[3]), int(m[4])
-            print(f"File: {file} (batches {start} to {end} for rank {rank})")
-            print(cudf.read_parquet(os.path.join(directory, file)))
-            print("\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py
deleted file mode 100644
index de45acc7456..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This example shows how to use cuGraph nccl-only comms, pylibcuGraph,
-# and PyTorch to run a single-GPU sampling workflow.  Most users of the
-# GNN packages will not interact with cuGraph directly.  This example
-# is intented for users who want to extend cuGraph within a PyTorch workflow.
-
-import os
-import re
-import tempfile
-
-import numpy as np
-
-import cudf
-
-from cugraph.gnn import (
-    DistSampleWriter,
-    UniformNeighborSampler,
-)
-
-from pylibcugraph import SGGraph, ResourceHandle, GraphProperties
-
-from ogb.nodeproppred import NodePropPredDataset
-
-
-def sample(edgelist, directory):
-    src = cudf.Series(edgelist[0])
-    dst = cudf.Series(edgelist[1])
-
-    seeds_per_rank = 50
-    seeds = cudf.Series(np.arange(0, seeds_per_rank))
-
-    print("constructing graph")
-    G = SGGraph(
-        ResourceHandle(),
-        GraphProperties(is_multigraph=True, is_symmetric=False),
-        src,
-        dst,
-    )
-    print("graph constructed")
-
-    sample_writer = DistSampleWriter(directory=directory, batches_per_partition=2)
-    sampler = UniformNeighborSampler(
-        G,
-        sample_writer,
-        fanout=[5, 5],
-        compression="CSR",
-        retain_original_seeds=True,
-    )
-
-    sampler.sample_from_nodes(seeds, batch_size=16, random_state=62)
-
-
-def main():
-    dataset = NodePropPredDataset("ogbn-products")
-    el = dataset[0][0]["edge_index"].astype("int64")
-
-    with tempfile.TemporaryDirectory() as directory:
-        sample(el, directory)
-
-        print("Printing samples...")
-        for file in os.listdir(directory):
-            m = re.match(r"batch=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet", file)
-            rank, start, _, end = int(m[1]), int(m[2]), int(m[3]), int(m[4])
-            print(f"File: {file} (batches {start} to {end} for rank {rank})")
-            print(cudf.read_parquet(os.path.join(directory, file)))
-            print("\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py
deleted file mode 100644
index 127ca809d91..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py
+++ /dev/null
@@ -1,446 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Multi-node, multi-GPU example with WholeGraph feature storage.
-# Can be run with torchrun.
-
-import argparse
-import os
-import warnings
-import tempfile
-import time
-import json
-
-
-import torch
-import torch.distributed as dist
-import torch.nn.functional as F
-from ogb.nodeproppred import PygNodePropPredDataset
-from torch.nn.parallel import DistributedDataParallel
-
-import torch_geometric
-
-from cugraph.gnn import (
-    cugraph_comms_init,
-    cugraph_comms_shutdown,
-    cugraph_comms_create_unique_id,
-)
-
-from pylibwholegraph.torch.initialize import (
-    init as wm_init,
-    finalize as wm_finalize,
-)
-
-# Allow computation on objects that are larger than GPU memory
-# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory
-os.environ["CUDF_SPILL"] = "1"
-
-# Ensures that a CUDA context is not created on import of rapids.
-# Allows pytorch to create the context instead
-os.environ["RAPIDS_NO_INITIALIZE"] = "1"
-
-
-def init_pytorch_worker(global_rank, local_rank, world_size, cugraph_id):
-    import rmm
-
-    rmm.reinitialize(
-        devices=local_rank,
-        managed_memory=True,
-        pool_allocator=True,
-    )
-
-    import cupy
-
-    cupy.cuda.Device(local_rank).use()
-    from rmm.allocators.cupy import rmm_cupy_allocator
-
-    cupy.cuda.set_allocator(rmm_cupy_allocator)
-
-    from cugraph.testing.mg_utils import enable_spilling
-
-    enable_spilling()
-
-    torch.cuda.set_device(local_rank)
-
-    cugraph_comms_init(
-        rank=global_rank, world_size=world_size, uid=cugraph_id, device=local_rank
-    )
-
-    wm_init(global_rank, world_size, local_rank, torch.cuda.device_count())
-
-
-def partition_data(dataset, split_idx, edge_path, feature_path, label_path, meta_path):
-    data = dataset[0]
-
-    # Split and save edge index
-    os.makedirs(
-        edge_path,
-        exist_ok=True,
-    )
-    for (r, e) in enumerate(torch.tensor_split(data.edge_index, world_size, dim=1)):
-        rank_path = os.path.join(edge_path, f"rank={r}.pt")
-        torch.save(
-            e.clone(),
-            rank_path,
-        )
-
-    # Split and save features
-    os.makedirs(
-        feature_path,
-        exist_ok=True,
-    )
-
-    for (r, f) in enumerate(torch.tensor_split(data.x, world_size)):
-        rank_path = os.path.join(feature_path, f"rank={r}_x.pt")
-        torch.save(
-            f.clone(),
-            rank_path,
-        )
-    for (r, f) in enumerate(torch.tensor_split(data.y, world_size)):
-        rank_path = os.path.join(feature_path, f"rank={r}_y.pt")
-        torch.save(
-            f.clone(),
-            rank_path,
-        )
-
-    # Split and save labels
-    os.makedirs(
-        label_path,
-        exist_ok=True,
-    )
-    for (d, i) in split_idx.items():
-        i_parts = torch.tensor_split(i, world_size)
-        for r, i_part in enumerate(i_parts):
-            rank_path = os.path.join(label_path, f"rank={r}")
-            os.makedirs(rank_path, exist_ok=True)
-            torch.save(i_part, os.path.join(rank_path, f"{d}.pt"))
-
-    # Save metadata
-    meta = {
-        "num_classes": int(dataset.num_classes),
-        "num_features": int(dataset.num_features),
-        "num_nodes": int(data.num_nodes),
-    }
-    with open(meta_path, "w") as f:
-        json.dump(meta, f)
-
-
-def load_partitioned_data(
-    rank, edge_path, feature_path, label_path, meta_path, wg_mem_type
-):
-    from cugraph_pyg.data import GraphStore, WholeFeatureStore
-
-    graph_store = GraphStore(is_multi_gpu=True)
-    feature_store = WholeFeatureStore(memory_type=wg_mem_type)
-
-    # Load metadata
-    with open(meta_path, "r") as f:
-        meta = json.load(f)
-
-    # Load labels
-    split_idx = {}
-    for split in ["train", "test", "valid"]:
-        split_idx[split] = torch.load(
-            os.path.join(label_path, f"rank={rank}", f"{split}.pt")
-        )
-
-    # Load features
-    feature_store["node", "x"] = torch.load(
-        os.path.join(feature_path, f"rank={rank}_x.pt")
-    )
-    feature_store["node", "y"] = torch.load(
-        os.path.join(feature_path, f"rank={rank}_y.pt")
-    )
-
-    # Load edge index
-    eix = torch.load(os.path.join(edge_path, f"rank={rank}.pt"))
-    graph_store[
-        ("node", "rel", "node"), "coo", False, (meta["num_nodes"], meta["num_nodes"])
-    ] = eix
-
-    return (feature_store, graph_store), split_idx, meta
-
-
-def run_train(
-    global_rank,
-    data,
-    split_idx,
-    world_size,
-    device,
-    model,
-    epochs,
-    batch_size,
-    fan_out,
-    num_classes,
-    wall_clock_start,
-    tempdir=None,
-    num_layers=3,
-    in_memory=False,
-    seeds_per_call=-1,
-):
-    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005)
-
-    kwargs = dict(
-        num_neighbors=[fan_out] * num_layers,
-        batch_size=batch_size,
-    )
-    # Set Up Neighbor Loading
-    from cugraph_pyg.loader import NeighborLoader
-
-    ix_train = split_idx["train"].cuda()
-    train_path = None if in_memory else os.path.join(tempdir, f"train_{global_rank}")
-    if train_path:
-        os.mkdir(train_path)
-    train_loader = NeighborLoader(
-        data,
-        input_nodes=ix_train,
-        directory=train_path,
-        shuffle=True,
-        drop_last=True,
-        local_seeds_per_call=seeds_per_call if seeds_per_call > 0 else None,
-        **kwargs,
-    )
-
-    ix_test = split_idx["test"].cuda()
-    test_path = None if in_memory else os.path.join(tempdir, f"test_{global_rank}")
-    if test_path:
-        os.mkdir(test_path)
-    test_loader = NeighborLoader(
-        data,
-        input_nodes=ix_test,
-        directory=test_path,
-        shuffle=True,
-        drop_last=True,
-        local_seeds_per_call=80000,
-        **kwargs,
-    )
-
-    ix_valid = split_idx["valid"].cuda()
-    valid_path = None if in_memory else os.path.join(tempdir, f"valid_{global_rank}")
-    if valid_path:
-        os.mkdir(valid_path)
-    valid_loader = NeighborLoader(
-        data,
-        input_nodes=ix_valid,
-        directory=valid_path,
-        shuffle=True,
-        drop_last=True,
-        local_seeds_per_call=seeds_per_call if seeds_per_call > 0 else None,
-        **kwargs,
-    )
-
-    dist.barrier()
-
-    eval_steps = 1000
-    warmup_steps = 20
-    dist.barrier()
-    torch.cuda.synchronize()
-
-    if global_rank == 0:
-        prep_time = round(time.perf_counter() - wall_clock_start, 2)
-        print("Total time before training begins (prep_time) =", prep_time, "seconds")
-        print("Beginning training...")
-
-    for epoch in range(epochs):
-        for i, batch in enumerate(train_loader):
-            if i == warmup_steps:
-                torch.cuda.synchronize()
-                start = time.time()
-
-            batch = batch.to(device)
-            batch_size = batch.batch_size
-
-            batch.y = batch.y.view(-1).to(torch.long)
-            optimizer.zero_grad()
-            out = model(batch.x, batch.edge_index)
-            loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size])
-            loss.backward()
-            optimizer.step()
-            if global_rank == 0 and i % 10 == 0:
-                print(
-                    "Epoch: "
-                    + str(epoch)
-                    + ", Iteration: "
-                    + str(i)
-                    + ", Loss: "
-                    + str(loss)
-                )
-        nb = i + 1.0
-
-        if global_rank == 0:
-            print(
-                "Average Training Iteration Time:",
-                (time.time() - start) / (nb - warmup_steps),
-                "s/iter",
-            )
-
-        with torch.no_grad():
-            total_correct = total_examples = 0
-            for i, batch in enumerate(valid_loader):
-                if i >= eval_steps:
-                    break
-
-                batch = batch.to(device)
-                batch_size = batch.batch_size
-
-                batch.y = batch.y.to(torch.long)
-                out = model(batch.x, batch.edge_index)[:batch_size]
-
-                pred = out.argmax(dim=-1)
-                y = batch.y[:batch_size].view(-1).to(torch.long)
-
-                total_correct += int((pred == y).sum())
-                total_examples += y.size(0)
-
-            acc_val = total_correct / total_examples
-            if global_rank == 0:
-                print(
-                    f"Validation Accuracy: {acc_val * 100.0:.4f}%",
-                )
-
-        torch.cuda.synchronize()
-
-    with torch.no_grad():
-        total_correct = total_examples = 0
-        for i, batch in enumerate(test_loader):
-            batch = batch.to(device)
-            batch_size = batch.batch_size
-
-            batch.y = batch.y.to(torch.long)
-            out = model(batch.x, batch.edge_index)[:batch_size]
-
-            pred = out.argmax(dim=-1)
-            y = batch.y[:batch_size].view(-1).to(torch.long)
-
-            total_correct += int((pred == y).sum())
-            total_examples += y.size(0)
-
-        acc_test = total_correct / total_examples
-        if global_rank == 0:
-            print(
-                f"Test Accuracy: {acc_test * 100.0:.4f}%",
-            )
-
-    if global_rank == 0:
-        total_time = round(time.perf_counter() - wall_clock_start, 2)
-        print("Total Program Runtime (total_time) =", total_time, "seconds")
-        print("total_time - prep_time =", total_time - prep_time, "seconds")
-
-    wm_finalize()
-    cugraph_comms_shutdown()
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--hidden_channels", type=int, default=256)
-    parser.add_argument("--num_layers", type=int, default=2)
-    parser.add_argument("--lr", type=float, default=0.001)
-    parser.add_argument("--epochs", type=int, default=4)
-    parser.add_argument("--batch_size", type=int, default=1024)
-    parser.add_argument("--fan_out", type=int, default=30)
-    parser.add_argument("--tempdir_root", type=str, default=None)
-    parser.add_argument("--dataset_root", type=str, default="dataset")
-    parser.add_argument("--dataset", type=str, default="ogbn-products")
-    parser.add_argument("--skip_partition", action="store_true")
-    parser.add_argument("--wg_mem_type", type=str, default="distributed")
-
-    parser.add_argument("--in_memory", action="store_true", default=False)
-    parser.add_argument("--seeds_per_call", type=int, default=-1)
-
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    wall_clock_start = time.perf_counter()
-
-    if "LOCAL_RANK" in os.environ:
-        dist.init_process_group("nccl")
-        world_size = dist.get_world_size()
-        global_rank = dist.get_rank()
-        local_rank = int(os.environ["LOCAL_RANK"])
-        device = torch.device(local_rank)
-
-        # Create the uid needed for cuGraph comms
-        if global_rank == 0:
-            cugraph_id = [cugraph_comms_create_unique_id()]
-        else:
-            cugraph_id = [None]
-        dist.broadcast_object_list(cugraph_id, src=0, device=device)
-        cugraph_id = cugraph_id[0]
-
-        init_pytorch_worker(global_rank, local_rank, world_size, cugraph_id)
-
-        # Split the data
-        edge_path = os.path.join(args.dataset_root, args.dataset + "_eix_part")
-        feature_path = os.path.join(args.dataset_root, args.dataset + "_fea_part")
-        label_path = os.path.join(args.dataset_root, args.dataset + "_label_part")
-        meta_path = os.path.join(args.dataset_root, args.dataset + "_meta.json")
-
-        # We partition the data to avoid loading it in every worker, which will
-        # waste memory and can lead to an out of memory exception.
-        # cugraph_pyg.GraphStore and cugraph_pyg.WholeFeatureStore are always
-        # constructed from partitions of the edge index and features, respectively,
-        # so this works well.
-        if not args.skip_partition and global_rank == 0:
-            dataset = PygNodePropPredDataset(name=args.dataset, root=args.dataset_root)
-            split_idx = dataset.get_idx_split()
-
-            partition_data(
-                dataset,
-                split_idx,
-                meta_path=meta_path,
-                label_path=label_path,
-                feature_path=feature_path,
-                edge_path=edge_path,
-            )
-
-        dist.barrier()
-        data, split_idx, meta = load_partitioned_data(
-            rank=global_rank,
-            edge_path=edge_path,
-            feature_path=feature_path,
-            label_path=label_path,
-            meta_path=meta_path,
-            wg_mem_type=args.wg_mem_type,
-        )
-        dist.barrier()
-
-        model = torch_geometric.nn.models.GCN(
-            meta["num_features"],
-            args.hidden_channels,
-            args.num_layers,
-            meta["num_classes"],
-        ).to(device)
-        model = DistributedDataParallel(model, device_ids=[local_rank])
-
-        with tempfile.TemporaryDirectory(dir=args.tempdir_root) as tempdir:
-            run_train(
-                global_rank,
-                data,
-                split_idx,
-                world_size,
-                device,
-                model,
-                args.epochs,
-                args.batch_size,
-                args.fan_out,
-                meta["num_classes"],
-                wall_clock_start,
-                tempdir,
-                args.num_layers,
-                args.in_memory,
-                args.seeds_per_call,
-            )
-    else:
-        warnings.warn("This script should be run with 'torchrun`.  Exiting.")
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py
deleted file mode 100644
index 0f9c39bf04d..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import time
-import argparse
-import tempfile
-import os
-import warnings
-
-from typing import Optional, Tuple, Dict
-
-import torch
-import cupy
-
-import rmm
-from rmm.allocators.cupy import rmm_cupy_allocator
-from rmm.allocators.torch import rmm_torch_allocator
-
-# Must change allocators immediately upon import
-# or else other imports will cause memory to be
-# allocated and prevent changing the allocator
-rmm.reinitialize(devices=[0], pool_allocator=True, managed_memory=True)
-cupy.cuda.set_allocator(rmm_cupy_allocator)
-torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
-
-import torch.nn.functional as F  # noqa: E402
-import torch_geometric  # noqa: E402
-import cugraph_pyg  # noqa: E402
-from cugraph_pyg.loader import NeighborLoader  # noqa: E402
-
-# Enable cudf spilling to save gpu memory
-from cugraph.testing.mg_utils import enable_spilling  # noqa: E402
-
-enable_spilling()
-
-
-def train(epoch: int):
-    model.train()
-    for i, batch in enumerate(train_loader):
-        if i == warmup_steps:
-            torch.cuda.synchronize()
-            start_avg_time = time.perf_counter()
-        batch = batch.to(device)
-
-        optimizer.zero_grad()
-        batch_size = batch.batch_size
-        out = model(batch.x, batch.edge_index)[:batch_size]
-        y = batch.y[:batch_size].view(-1).to(torch.long)
-
-        loss = F.cross_entropy(out, y)
-        loss.backward()
-        optimizer.step()
-
-        if i % 10 == 0:
-            print(f"Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}")
-    torch.cuda.synchronize()
-    print(
-        f"Average Training Iteration Time (s/iter): \
-            {(time.perf_counter() - start_avg_time) / (i - warmup_steps):.6f}"
-    )
-
-
-@torch.no_grad()
-def test(loader: NeighborLoader, val_steps: Optional[int] = None):
-    model.eval()
-
-    total_correct = total_examples = 0
-    for i, batch in enumerate(loader):
-        if val_steps is not None and i >= val_steps:
-            break
-        batch = batch.to(device)
-        batch_size = batch.batch_size
-        out = model(batch.x, batch.edge_index)[:batch_size]
-        pred = out.argmax(dim=-1)
-        y = batch.y[:batch_size].view(-1).to(torch.long)
-
-        total_correct += int((pred == y).sum())
-        total_examples += y.size(0)
-
-    return total_correct / total_examples
-
-
-def create_loader(
-    data,
-    num_neighbors,
-    input_nodes,
-    replace,
-    batch_size,
-    samples_dir,
-    stage_name,
-    local_seeds_per_call,
-):
-    if samples_dir is not None:
-        directory = os.path.join(samples_dir, stage_name)
-        os.mkdir(directory)
-    else:
-        directory = None
-    return NeighborLoader(
-        data,
-        num_neighbors=num_neighbors,
-        input_nodes=input_nodes,
-        replace=replace,
-        batch_size=batch_size,
-        directory=directory,
-        local_seeds_per_call=local_seeds_per_call,
-    )
-
-
-def load_data(
-    dataset, dataset_root
-) -> Tuple[
-    Tuple[torch_geometric.data.FeatureStore, torch_geometric.data.GraphStore],
-    Dict[str, torch.Tensor],
-    int,
-    int,
-]:
-    from ogb.nodeproppred import PygNodePropPredDataset
-
-    dataset = PygNodePropPredDataset(dataset, root=dataset_root)
-    split_idx = dataset.get_idx_split()
-    data = dataset[0]
-
-    graph_store = cugraph_pyg.data.GraphStore()
-    graph_store[
-        ("node", "rel", "node"), "coo", False, (data.num_nodes, data.num_nodes)
-    ] = data.edge_index
-
-    feature_store = cugraph_pyg.data.TensorDictFeatureStore()
-    feature_store["node", "x"] = data.x
-    feature_store["node", "y"] = data.y
-
-    return (
-        (feature_store, graph_store),
-        split_idx,
-        dataset.num_features,
-        dataset.num_classes,
-    )
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--hidden_channels", type=int, default=256)
-    parser.add_argument("--num_layers", type=int, default=2)
-    parser.add_argument("--lr", type=float, default=0.001)
-    parser.add_argument("--epochs", type=int, default=4)
-    parser.add_argument("--batch_size", type=int, default=1024)
-    parser.add_argument("--fan_out", type=int, default=30)
-    parser.add_argument("--tempdir_root", type=str, default=None)
-    parser.add_argument("--dataset_root", type=str, default="dataset")
-    parser.add_argument("--dataset", type=str, default="ogbn-products")
-    parser.add_argument("--in_memory", action="store_true", default=False)
-    parser.add_argument("--seeds_per_call", type=int, default=-1)
-
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    wall_clock_start = time.perf_counter()
-    device = torch.device("cuda")
-
-    data, split_idx, num_features, num_classes = load_data(
-        args.dataset, args.dataset_root
-    )
-
-    if "CI_RUN" in os.environ and os.environ["CI_RUN"] == "1":
-        warnings.warn("Pruning test dataset for CI run.")
-        split_idx["test"] = split_idx["test"][:1000]
-
-    with tempfile.TemporaryDirectory(dir=args.tempdir_root) as samples_dir:
-        loader_kwargs = {
-            "data": data,
-            "num_neighbors": [args.fan_out] * args.num_layers,
-            "replace": False,
-            "batch_size": args.batch_size,
-            "samples_dir": None if args.in_memory else samples_dir,
-            "local_seeds_per_call": None
-            if args.seeds_per_call <= 0
-            else args.seeds_per_call,
-        }
-
-        train_loader = create_loader(
-            input_nodes=split_idx["train"],
-            stage_name="train",
-            **loader_kwargs,
-        )
-
-        val_loader = create_loader(
-            input_nodes=split_idx["valid"],
-            stage_name="val",
-            **loader_kwargs,
-        )
-
-        test_loader = create_loader(
-            input_nodes=split_idx["test"],
-            stage_name="test",
-            **loader_kwargs,
-        )
-
-        model = torch_geometric.nn.models.GCN(
-            num_features,
-            args.hidden_channels,
-            args.num_layers,
-            num_classes,
-        ).to(device)
-
-        optimizer = torch.optim.Adam(
-            model.parameters(), lr=args.lr, weight_decay=0.0005
-        )
-
-        warmup_steps = 20
-
-        torch.cuda.synchronize()
-        prep_time = round(time.perf_counter() - wall_clock_start, 2)
-        print("Total time before training begins (prep_time)=", prep_time, "seconds")
-        print("Beginning training...")
-        for epoch in range(1, 1 + args.epochs):
-            train(epoch)
-            val_acc = test(val_loader, val_steps=100)
-            print(f"Val Acc: ~{val_acc:.4f}")
-
-        test_acc = test(test_loader)
-        print(f"Test Acc: {test_acc:.4f}")
-        total_time = round(time.perf_counter() - wall_clock_start, 2)
-        print("Total Program Runtime (total_time) =", total_time, "seconds")
-        print("total_time - prep_time =", total_time - prep_time, "seconds")
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py
deleted file mode 100644
index 73efbc92a24..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py
+++ /dev/null
@@ -1,339 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Single-node, multi-GPU example.
-
-import argparse
-import os
-import tempfile
-import time
-import warnings
-
-import torch
-import torch.distributed as dist
-import torch.multiprocessing as mp
-import torch.nn.functional as F
-from ogb.nodeproppred import PygNodePropPredDataset
-from torch.nn.parallel import DistributedDataParallel
-
-import torch_geometric
-
-from cugraph.gnn import (
-    cugraph_comms_init,
-    cugraph_comms_shutdown,
-    cugraph_comms_create_unique_id,
-)
-
-# Allow computation on objects that are larger than GPU memory
-# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory
-os.environ["CUDF_SPILL"] = "1"
-
-# Ensures that a CUDA context is not created on import of rapids.
-# Allows pytorch to create the context instead
-os.environ["RAPIDS_NO_INITIALIZE"] = "1"
-
-
-def init_pytorch_worker(rank, world_size, cugraph_id):
-    import rmm
-
-    rmm.reinitialize(
-        devices=rank,
-        managed_memory=True,
-        pool_allocator=True,
-    )
-
-    import cupy
-
-    cupy.cuda.Device(rank).use()
-    from rmm.allocators.cupy import rmm_cupy_allocator
-
-    cupy.cuda.set_allocator(rmm_cupy_allocator)
-
-    from cugraph.testing.mg_utils import enable_spilling
-
-    enable_spilling()
-
-    torch.cuda.set_device(rank)
-
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "12355"
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
-
-    cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank)
-
-
-def run_train(
-    rank,
-    data,
-    world_size,
-    cugraph_id,
-    model,
-    epochs,
-    batch_size,
-    fan_out,
-    split_idx,
-    num_classes,
-    wall_clock_start,
-    tempdir=None,
-    num_layers=3,
-    in_memory=False,
-    seeds_per_call=-1,
-):
-
-    init_pytorch_worker(
-        rank,
-        world_size,
-        cugraph_id,
-    )
-
-    model = model.to(rank)
-    model = DistributedDataParallel(model, device_ids=[rank])
-    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005)
-
-    kwargs = dict(
-        num_neighbors=[fan_out] * num_layers,
-        batch_size=batch_size,
-    )
-    # Set Up Neighbor Loading
-    from cugraph_pyg.data import GraphStore, TensorDictFeatureStore
-    from cugraph_pyg.loader import NeighborLoader
-
-    graph_store = GraphStore(is_multi_gpu=True)
-    ixr = torch.tensor_split(data.edge_index, world_size, dim=1)[rank]
-    graph_store[
-        ("node", "rel", "node"), "coo", False, (data.num_nodes, data.num_nodes)
-    ] = ixr
-
-    feature_store = TensorDictFeatureStore()
-    feature_store["node", "x"] = data.x
-    feature_store["node", "y"] = data.y
-
-    dist.barrier()
-
-    ix_train = torch.tensor_split(split_idx["train"], world_size)[rank].cuda()
-    train_path = None if in_memory else os.path.join(tempdir, f"train_{rank}")
-    if train_path:
-        os.mkdir(train_path)
-    train_loader = NeighborLoader(
-        (feature_store, graph_store),
-        input_nodes=ix_train,
-        directory=train_path,
-        shuffle=True,
-        drop_last=True,
-        local_seeds_per_call=seeds_per_call if seeds_per_call > 0 else None,
-        **kwargs,
-    )
-
-    ix_test = torch.tensor_split(split_idx["test"], world_size)[rank].cuda()
-    test_path = None if in_memory else os.path.join(tempdir, f"test_{rank}")
-    if test_path:
-        os.mkdir(test_path)
-    test_loader = NeighborLoader(
-        (feature_store, graph_store),
-        input_nodes=ix_test,
-        directory=test_path,
-        shuffle=True,
-        drop_last=True,
-        local_seeds_per_call=80000,
-        **kwargs,
-    )
-
-    ix_valid = torch.tensor_split(split_idx["valid"], world_size)[rank].cuda()
-    valid_path = None if in_memory else os.path.join(tempdir, f"valid_{rank}")
-    if valid_path:
-        os.mkdir(valid_path)
-    valid_loader = NeighborLoader(
-        (feature_store, graph_store),
-        input_nodes=ix_valid,
-        directory=valid_path,
-        shuffle=True,
-        drop_last=True,
-        local_seeds_per_call=seeds_per_call if seeds_per_call > 0 else None,
-        **kwargs,
-    )
-
-    dist.barrier()
-
-    eval_steps = 1000
-    warmup_steps = 20
-    dist.barrier()
-    torch.cuda.synchronize()
-
-    if rank == 0:
-        prep_time = round(time.perf_counter() - wall_clock_start, 2)
-        print("Total time before training begins (prep_time) =", prep_time, "seconds")
-        print("Beginning training...")
-    for epoch in range(epochs):
-        for i, batch in enumerate(train_loader):
-            if i == warmup_steps:
-                torch.cuda.synchronize()
-                start = time.time()
-
-            batch = batch.to(rank)
-            batch_size = batch.batch_size
-
-            batch.y = batch.y.to(torch.long)
-            optimizer.zero_grad()
-            out = model(batch.x, batch.edge_index)
-            loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size])
-            loss.backward()
-            optimizer.step()
-            if rank == 0 and i % 10 == 0:
-                print(
-                    "Epoch: "
-                    + str(epoch)
-                    + ", Iteration: "
-                    + str(i)
-                    + ", Loss: "
-                    + str(loss)
-                )
-        nb = i + 1.0
-
-        if rank == 0:
-            print(
-                "Average Training Iteration Time:",
-                (time.time() - start) / (nb - warmup_steps),
-                "s/iter",
-            )
-
-        with torch.no_grad():
-            total_correct = total_examples = 0
-            for i, batch in enumerate(valid_loader):
-                if i >= eval_steps:
-                    break
-
-                batch = batch.to(rank)
-                batch_size = batch.batch_size
-
-                batch.y = batch.y.to(torch.long)
-                out = model(batch.x, batch.edge_index)[:batch_size]
-
-                pred = out.argmax(dim=-1)
-                y = batch.y[:batch_size].view(-1).to(torch.long)
-
-                total_correct += int((pred == y).sum())
-                total_examples += y.size(0)
-
-            acc_val = total_correct / total_examples
-            if rank == 0:
-                print(
-                    f"Validation Accuracy: {acc_val * 100.0:.4f}%",
-                )
-
-        torch.cuda.synchronize()
-
-    with torch.no_grad():
-        total_correct = total_examples = 0
-        for i, batch in enumerate(test_loader):
-            batch = batch.to(rank)
-            batch_size = batch.batch_size
-
-            batch.y = batch.y.to(torch.long)
-            out = model(batch.x, batch.edge_index)[:batch_size]
-
-            pred = out.argmax(dim=-1)
-            y = batch.y[:batch_size].view(-1).to(torch.long)
-
-            total_correct += int((pred == y).sum())
-            total_examples += y.size(0)
-
-        acc_test = total_correct / total_examples
-        if rank == 0:
-            print(
-                f"Test Accuracy: {acc_test * 100.0:.4f}%",
-            )
-
-    if rank == 0:
-        total_time = round(time.perf_counter() - wall_clock_start, 2)
-        print("Total Program Runtime (total_time) =", total_time, "seconds")
-        print("total_time - prep_time =", total_time - prep_time, "seconds")
-
-    cugraph_comms_shutdown()
-    dist.destroy_process_group()
-
-
-if __name__ == "__main__":
-    if "CI_RUN" in os.environ and os.environ["CI_RUN"] == "1":
-        warnings.warn("Skipping SMNG example in CI due to memory limit")
-    else:
-        parser = argparse.ArgumentParser()
-        parser.add_argument("--hidden_channels", type=int, default=256)
-        parser.add_argument("--num_layers", type=int, default=2)
-        parser.add_argument("--lr", type=float, default=0.001)
-        parser.add_argument("--epochs", type=int, default=4)
-        parser.add_argument("--batch_size", type=int, default=1024)
-        parser.add_argument("--fan_out", type=int, default=30)
-        parser.add_argument("--tempdir_root", type=str, default=None)
-        parser.add_argument("--dataset_root", type=str, default="dataset")
-        parser.add_argument("--dataset", type=str, default="ogbn-products")
-        parser.add_argument("--in_memory", action="store_true", default=False)
-        parser.add_argument("--seeds_per_call", type=int, default=-1)
-
-        parser.add_argument(
-            "--n_devices",
-            type=int,
-            default=-1,
-            help="1-8 to use that many GPUs. Defaults to all available GPUs",
-        )
-
-        args = parser.parse_args()
-        wall_clock_start = time.perf_counter()
-
-        from rmm.allocators.torch import rmm_torch_allocator
-
-        torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
-
-        dataset = PygNodePropPredDataset(name=args.dataset, root=args.dataset_root)
-        split_idx = dataset.get_idx_split()
-        data = dataset[0]
-        data.y = data.y.reshape(-1)
-
-        model = torch_geometric.nn.models.GCN(
-            dataset.num_features,
-            args.hidden_channels,
-            args.num_layers,
-            dataset.num_classes,
-        )
-
-        print("Data =", data)
-        if args.n_devices == -1:
-            world_size = torch.cuda.device_count()
-        else:
-            world_size = args.n_devices
-        print("Using", world_size, "GPUs...")
-
-        # Create the uid needed for cuGraph comms
-        cugraph_id = cugraph_comms_create_unique_id()
-
-        with tempfile.TemporaryDirectory(dir=args.tempdir_root) as tempdir:
-            mp.spawn(
-                run_train,
-                args=(
-                    data,
-                    world_size,
-                    cugraph_id,
-                    model,
-                    args.epochs,
-                    args.batch_size,
-                    args.fan_out,
-                    split_idx,
-                    dataset.num_classes,
-                    wall_clock_start,
-                    tempdir,
-                    args.num_layers,
-                    args.in_memory,
-                    args.seeds_per_call,
-                ),
-                nprocs=world_size,
-                join=True,
-            )
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py
deleted file mode 100644
index 145675c8a06..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py
+++ /dev/null
@@ -1,446 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# For this script, dask must be started first in a separate process.
-# To do this, the `start_dask.sh` script has been provided.  This scripts starts
-# a dask scheduler and dask workers.  To select the GPUs and amount of memory
-# allocated to dask per GPU, the `CUDA_VISIBLE_DEVICES` and `WORKER_RMM_POOL_SIZE`
-# arguments in that script can be modified.
-# To connect to dask, the scheduler JSON file must be provided.  This can be done
-# using the `--dask_scheduler_file` argument in the mg python script being run.
-
-from ogb.nodeproppred import NodePropPredDataset
-
-import time
-import argparse
-import gc
-import warnings
-
-import torch
-import numpy as np
-
-from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv
-
-import torch.nn as nn
-import torch.nn.functional as F
-
-import torch.distributed as td
-import torch.multiprocessing as tmp
-from torch.nn.parallel import DistributedDataParallel as ddp
-
-from typing import List
-
-
-class CuGraphSAGE(nn.Module):
-    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
-        super().__init__()
-
-        self.convs = torch.nn.ModuleList()
-        self.convs.append(CuGraphSAGEConv(in_channels, hidden_channels))
-        for _ in range(num_layers - 1):
-            conv = CuGraphSAGEConv(hidden_channels, hidden_channels)
-            self.convs.append(conv)
-
-        self.lin = nn.Linear(hidden_channels, out_channels)
-
-    def forward(self, x, edge, size):
-        edge_csc = CuGraphSAGEConv.to_csc(edge, (size[0], size[0]))
-        for conv in self.convs:
-            x = conv(x, edge_csc)[: size[1]]
-            x = F.relu(x)
-            x = F.dropout(x, p=0.5)
-
-        return self.lin(x)
-
-
-def enable_cudf_spilling():
-    import cudf
-
-    cudf.set_option("spill", True)
-
-
-def init_pytorch_worker(rank, devices, manager_ip, manager_port) -> None:
-    import cupy
-    import rmm
-
-    device_id = devices[rank]
-
-    rmm.reinitialize(
-        devices=[device_id],
-        pool_allocator=False,
-    )
-
-    # torch.cuda.change_current_allocator(rmm.rmm_torch_allocator)
-    # cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)
-
-    cupy.cuda.Device(device_id).use()
-    torch.cuda.set_device(device_id)
-
-    # Pytorch training worker initialization
-    dist_init_method = f"tcp://{manager_ip}:{manager_port}"
-
-    torch.distributed.init_process_group(
-        backend="nccl",
-        init_method=dist_init_method,
-        world_size=len(devices),
-        rank=rank,
-    )
-
-    # enable_cudf_spilling()
-
-
-def start_cugraph_dask_client(rank, dask_scheduler_file):
-    print(
-        "Connecting to dask... "
-        "(warning: this may take a while depending on your configuration)"
-    )
-    start_time_connect_dask = time.perf_counter_ns()
-    from distributed import Client
-    from cugraph.dask.comms import comms as Comms
-
-    client = Client(scheduler_file=dask_scheduler_file)
-    Comms.initialize(p2p=True)
-
-    end_time_connect_dask = time.perf_counter_ns()
-    print(
-        f"Successfully connected to dask on rank {rank}, took "
-        f"{(end_time_connect_dask - start_time_connect_dask) / 1e9:3.4f} s"
-    )
-    return client
-
-
-def stop_cugraph_dask_client():
-    from cugraph.dask.comms import comms as Comms
-
-    Comms.destroy()
-
-    from dask.distributed import get_client
-
-    get_client().close()
-
-
-def train(
-    rank,
-    torch_devices: List[int],
-    manager_ip: str,
-    manager_port: int,
-    dask_scheduler_file: str,
-    num_epochs: int,
-    features_on_gpu=True,
-) -> None:
-    """
-    Parameters
-    ----------
-    device: int
-        The CUDA device where the model, graph data, and node labels will be stored.
-    features_on_gpu: bool
-        Whether to store a replica of features on each worker's GPU.  If False,
-        all features will be stored on the CPU.
-    """
-
-    start_time_preprocess = time.perf_counter_ns()
-
-    world_size = len(torch_devices)
-    device_id = torch_devices[rank]
-    features_device = device_id if features_on_gpu else "cpu"
-    init_pytorch_worker(rank, torch_devices, manager_ip, manager_port)
-    td.barrier()
-
-    client = start_cugraph_dask_client(rank, dask_scheduler_file)
-
-    from distributed import Event as Dask_Event
-
-    event = Dask_Event("cugraph_store_creation_event")
-    download_event = Dask_Event("dataset_download_event")
-
-    td.barrier()
-
-    import cugraph
-    from cugraph_pyg.data import DaskGraphStore
-    from cugraph_pyg.loader import DaskNeighborLoader
-
-    if rank == 0:
-        print("Rank 0 downloading dataset")
-        dataset = NodePropPredDataset(name="ogbn-mag")
-        data = dataset[0]
-        download_event.set()
-        print("Dataset downloaded")
-    else:
-        if download_event.wait(timeout=1000):
-            print(f"Rank {rank} loading dataset")
-            dataset = NodePropPredDataset(name="ogbn-mag")
-            data = dataset[0]
-            print(f"Rank {rank} loaded dataset successfully")
-
-    ei = data[0]["edge_index_dict"][("paper", "cites", "paper")]
-    G = {
-        ("paper", "cites", "paper"): np.stack(
-            [np.concatenate([ei[0], ei[1]]), np.concatenate([ei[1], ei[0]])]
-        )
-    }
-    N = {"paper": data[0]["num_nodes_dict"]["paper"]}
-
-    fs = cugraph.gnn.FeatureStore(backend="torch")
-
-    fs.add_data(
-        torch.as_tensor(data[0]["node_feat_dict"]["paper"], device=features_device),
-        "paper",
-        "x",
-    )
-
-    fs.add_data(torch.as_tensor(data[1]["paper"].T[0], device=device_id), "paper", "y")
-
-    num_papers = data[0]["num_nodes_dict"]["paper"]
-
-    if rank == 0:
-        train_perc = 0.1
-        all_train_nodes = torch.randperm(num_papers)
-        all_train_nodes = all_train_nodes[: int(train_perc * num_papers)]
-        train_nodes = all_train_nodes[: int(len(all_train_nodes) / world_size)]
-
-        train_mask = torch.full((num_papers,), -1, device=device_id)
-        train_mask[train_nodes] = 1
-        fs.add_data(train_mask, "paper", "train")
-
-    print(f"Rank {rank} finished loading graph and feature data")
-
-    if rank == 0:
-        print("Rank 0 creating its cugraph store and initializing distributed graph")
-        # Rank 0 will initialize the distributed cugraph graph.
-        cugraph_store_create_start = time.perf_counter_ns()
-        print("G:", G[("paper", "cites", "paper")].shape)
-        cugraph_store = DaskGraphStore(fs, G, N, multi_gpu=True)
-        cugraph_store_create_end = time.perf_counter_ns()
-        print(
-            "cuGraph Store created on rank 0 in "
-            f"{(cugraph_store_create_end - cugraph_store_create_start) / 1e9:3.4f} s"
-        )
-        client.publish_dataset(train_nodes=all_train_nodes)
-        event.set()
-        print("Rank 0 done with cugraph store creation")
-    else:
-        if event.wait(timeout=1000):
-            print(f"Rank {rank} creating cugraph store")
-            train_nodes = client.get_dataset("train_nodes")
-            train_nodes = train_nodes[
-                int(rank * len(train_nodes) / world_size) : int(
-                    (rank + 1) * len(train_nodes) / world_size
-                )
-            ]
-
-            train_mask = torch.full((num_papers,), -1, device=device_id)
-            train_mask[train_nodes] = 1
-            fs.add_data(train_mask, "paper", "train")
-
-            # Will automatically use the stored distributed cugraph graph on rank 0.
-            cugraph_store_create_start = time.perf_counter_ns()
-            cugraph_store = DaskGraphStore(fs, G, N, multi_gpu=True)
-            cugraph_store_create_end = time.perf_counter_ns()
-            print(
-                f"Rank {rank} created cugraph store in "
-                f"{(cugraph_store_create_end - cugraph_store_create_start) / 1e9:3.4f}"
-                " s"
-            )
-            print(f"Rank {rank} done with cugraph store creation")
-
-    end_time_preprocess = time.perf_counter_ns()
-    print(f"rank {rank}: train {train_nodes.shape}", flush=True)
-    print(
-        f"rank {rank}: all preprocessing took"
-        f" {(end_time_preprocess - start_time_preprocess) / 1e9:3.4f}",
-        flush=True,
-    )
-    td.barrier()
-    model = (
-        CuGraphSAGE(in_channels=128, hidden_channels=64, out_channels=349, num_layers=3)
-        .to(torch.float32)
-        .to(device_id)
-    )
-    model = ddp(model, device_ids=[device_id], output_device=device_id)
-    td.barrier()
-
-    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
-
-    for epoch in range(num_epochs):
-        start_time_train = time.perf_counter_ns()
-        model.train()
-
-        start_time_loader = time.perf_counter_ns()
-        cugraph_bulk_loader = DaskNeighborLoader(
-            cugraph_store,
-            train_nodes,
-            batch_size=250,
-            num_neighbors=[10, 10, 10],
-            seeds_per_call=1000,
-            batches_per_partition=2,
-            replace=False,
-        )
-        end_time_loader = time.perf_counter_ns()
-        total_time_loader = (end_time_loader - start_time_loader) / 1e9
-
-        total_loss = 0
-        num_batches = 0
-
-        print(f"rank {rank} starting epoch {epoch}")
-        with td.algorithms.join.Join([model]):
-            total_time_sample = 0
-            total_time_forward = 0
-            total_time_backward = 0
-
-            start_time_sample = time.perf_counter_ns()
-            for iter_i, hetero_data in enumerate(cugraph_bulk_loader):
-                end_time_sample = time.perf_counter_ns()
-                total_time_sample += (end_time_sample - start_time_sample) / 1e9
-                num_batches += 1
-
-                if iter_i % 20 == 0:
-                    print(f"iteration {iter_i}")
-
-                # train
-                train_mask = hetero_data.train_dict["paper"]
-                y_true = hetero_data.y_dict["paper"]
-
-                start_time_forward = time.perf_counter_ns()
-                y_pred = model(
-                    hetero_data.x_dict["paper"].to(device_id).to(torch.float32),
-                    hetero_data.edge_index_dict[("paper", "cites", "paper")].to(
-                        device_id
-                    ),
-                    (len(y_true), len(y_true)),
-                )
-                end_time_forward = time.perf_counter_ns()
-                total_time_forward += (end_time_forward - start_time_forward) / 1e9
-
-                y_true = F.one_hot(
-                    y_true[train_mask].to(torch.int64), num_classes=349
-                ).to(torch.float32)
-
-                y_pred = y_pred[train_mask]
-
-                loss = F.cross_entropy(y_pred, y_true)
-
-                start_time_backward = time.perf_counter_ns()
-                optimizer.zero_grad()
-                loss.backward()
-                optimizer.step()
-                end_time_backward = time.perf_counter_ns()
-                total_time_backward += (end_time_backward - start_time_backward) / 1e9
-
-                total_loss += loss.item()
-
-                del y_true
-                del y_pred
-                del loss
-                del hetero_data
-                gc.collect()
-
-                start_time_sample = time.perf_counter_ns()
-
-            end_time_train = time.perf_counter_ns()
-            print(
-                f"epoch {epoch} "
-                f"total time: {(end_time_train - start_time_train) / 1e9:3.4f} s"
-                f"\nloader create time per batch: {total_time_loader / num_batches} s"
-                f"\nsampling/load time per batch: {total_time_sample / num_batches} s"
-                f"\nforward time per batch: {total_time_forward / num_batches} s"
-                f"\nbackward time per batch: {total_time_backward / num_batches} s"
-                f"\nnum batches: {num_batches}"
-            )
-            print(f"loss after epoch {epoch}: {total_loss / num_batches}")
-
-    td.barrier()
-    if rank == 0:
-        print("DONE", flush=True)
-        client.unpublish_dataset("train_nodes")
-        event.clear()
-
-    td.destroy_process_group()
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--torch_devices",
-        type=str,
-        default="0,1",
-        help="GPU to allocate to pytorch for model, graph data, and node label storage",
-        required=False,
-    )
-
-    parser.add_argument(
-        "--num_epochs",
-        type=int,
-        default=1,
-        help="Number of training epochs",
-        required=False,
-    )
-
-    parser.add_argument(
-        "--features_on_gpu",
-        type=bool,
-        default=True,
-        help="Whether to store the features on each worker's GPU",
-        required=False,
-    )
-
-    parser.add_argument(
-        "--torch_manager_ip",
-        type=str,
-        default="127.0.0.1",
-        help="The torch distributed manager ip address",
-        required=False,
-    )
-
-    parser.add_argument(
-        "--torch_manager_port",
-        type=str,
-        default="12346",
-        help="The torch distributed manager port",
-        required=False,
-    )
-
-    parser.add_argument(
-        "--dask_scheduler_file",
-        type=str,
-        help="The path to the dask scheduler file",
-        required=False,
-        default=None,
-    )
-
-    return parser.parse_args()
-
-
-def main():
-    args = parse_args()
-    if args.dask_scheduler_file is None:
-        warnings.warn(
-            "You must provide the dask scheduler file " "to run this example.  Exiting."
-        )
-
-    else:
-        torch_devices = [int(d) for d in args.torch_devices.split(",")]
-
-        train_args = (
-            torch_devices,
-            args.torch_manager_ip,
-            args.torch_manager_port,
-            args.dask_scheduler_file,
-            args.num_epochs,
-            args.features_on_gpu,
-        )
-
-        tmp.spawn(train, args=train_args, nprocs=len(torch_devices))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py
deleted file mode 100644
index e0169ee2c25..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py
+++ /dev/null
@@ -1,217 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import time
-import argparse
-import gc
-
-import torch
-
-from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv
-
-import torch.nn as nn
-import torch.nn.functional as F
-
-from typing import Union
-
-
-class CuGraphSAGE(nn.Module):
-    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
-        super().__init__()
-
-        self.convs = torch.nn.ModuleList()
-        self.convs.append(CuGraphSAGEConv(in_channels, hidden_channels))
-        for _ in range(num_layers - 1):
-            conv = CuGraphSAGEConv(hidden_channels, hidden_channels)
-            self.convs.append(conv)
-
-        self.lin = nn.Linear(hidden_channels, out_channels)
-
-    def forward(self, x, edge, size):
-        edge_csc = CuGraphSAGEConv.to_csc(edge, (size[0], size[0]))
-        for conv in self.convs:
-            x = conv(x, edge_csc)[: size[1]]
-            x = F.relu(x)
-            x = F.dropout(x, p=0.5)
-
-        return self.lin(x)
-
-
-def init_pytorch_worker(device_id: int) -> None:
-    import cupy
-    import rmm
-
-    rmm.reinitialize(
-        devices=[device_id],
-        pool_allocator=False,
-    )
-
-    cupy.cuda.Device(device_id).use()
-    torch.cuda.set_device(device_id)
-
-
-def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) -> None:
-    """
-    Parameters
-    ----------
-    device: int
-        The CUDA device where the model, graph data, and node labels will be stored.
-    features_device: Union[str, int]
-        The device (CUDA device or CPU) where features will be stored.
-    """
-
-    init_pytorch_worker(device)
-
-    import cugraph
-    from cugraph_pyg.data import DaskGraphStore
-    from cugraph_pyg.loader import DaskNeighborLoader
-
-    from ogb.nodeproppred import NodePropPredDataset
-
-    dataset = NodePropPredDataset(name="ogbn-mag")
-    data = dataset[0]
-
-    G = data[0]["edge_index_dict"]
-    N = data[0]["num_nodes_dict"]
-
-    fs = cugraph.gnn.FeatureStore(backend="torch")
-
-    fs.add_data(
-        torch.as_tensor(data[0]["node_feat_dict"]["paper"], device=features_device),
-        "paper",
-        "x",
-    )
-
-    fs.add_data(torch.as_tensor(data[1]["paper"].T[0], device=device), "paper", "y")
-
-    num_papers = data[0]["num_nodes_dict"]["paper"]
-    train_perc = 0.1
-
-    train_nodes = torch.randperm(num_papers)
-    train_nodes = train_nodes[: int(train_perc * num_papers)]
-
-    train_mask = torch.full((num_papers,), -1, device=device)
-    train_mask[train_nodes] = 1
-
-    fs.add_data(train_mask, "paper", "train")
-
-    cugraph_store = DaskGraphStore(fs, G, N)
-
-    model = (
-        CuGraphSAGE(in_channels=128, hidden_channels=64, out_channels=349, num_layers=3)
-        .to(torch.float32)
-        .to(device)
-    )
-
-    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
-
-    for epoch in range(num_epochs):
-        start_time_train = time.perf_counter_ns()
-        model.train()
-
-        cugraph_bulk_loader = DaskNeighborLoader(
-            cugraph_store, train_nodes, batch_size=500, num_neighbors=[10, 25]
-        )
-
-        total_loss = 0
-        num_batches = 0
-
-        # This context manager will handle different # batches per rank
-        # barrier() cannot do this since the number of ops per rank is
-        # different.  It essentially acts like barrier would if the
-        # number of ops per rank was the same.
-        for iter_i, hetero_data in enumerate(cugraph_bulk_loader):
-            num_batches += 1
-            if iter_i % 20 == 0:
-                print(f"iteration {iter_i}")
-
-            # train
-            train_mask = hetero_data.train_dict["paper"]
-            y_true = hetero_data.y_dict["paper"]
-
-            y_pred = model(
-                hetero_data.x_dict["paper"].to(device).to(torch.float32),
-                hetero_data.edge_index_dict[("paper", "cites", "paper")].to(device),
-                (len(y_true), len(y_true)),
-            )
-
-            y_true = F.one_hot(y_true[train_mask].to(torch.int64), num_classes=349).to(
-                torch.float32
-            )
-
-            y_pred = y_pred[train_mask]
-
-            loss = F.cross_entropy(y_pred, y_true)
-
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-            total_loss += loss.item()
-
-            del y_true
-            del y_pred
-            del loss
-            del hetero_data
-            gc.collect()
-
-        end_time_train = time.perf_counter_ns()
-        print(
-            f"epoch {epoch} time: "
-            f"{(end_time_train - start_time_train) / 1e9:3.4f} s"
-        )
-        print(f"loss after epoch {epoch}: {total_loss / num_batches}")
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--device",
-        type=int,
-        default=0,
-        help="GPU to allocate to pytorch for model, graph data, and node label storage",
-        required=False,
-    )
-
-    parser.add_argument(
-        "--features_device",
-        type=str,
-        default="0",
-        help="Device to allocate to pytorch for feature storage",
-        required=False,
-    )
-
-    parser.add_argument(
-        "--num_epochs",
-        type=int,
-        default=1,
-        help="Number of training epochs",
-        required=False,
-    )
-
-    return parser.parse_args()
-
-
-def main():
-    args = parse_args()
-
-    try:
-        features_device = int(args.features_device)
-    except ValueError:
-        features_device = args.features_device
-
-    train(args.device, features_device, args.num_epochs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_mg.py
deleted file mode 100644
index 832c5ec74f0..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_mg.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This example shows how to use cuGraph nccl-only comms, pylibcuGraph,
-# and PyTorch DDP to run a multi-GPU workflow.  Most users of the
-# GNN packages will not interact with cuGraph directly.  This example
-# is intented for users who want to extend cuGraph within a DDP workflow.
-
-import os
-
-import pandas
-import numpy as np
-import torch
-import torch.multiprocessing as tmp
-import torch.distributed as dist
-
-import cudf
-
-from cugraph.gnn import (
-    cugraph_comms_init,
-    cugraph_comms_shutdown,
-    cugraph_comms_create_unique_id,
-    cugraph_comms_get_raft_handle,
-)
-
-from pylibcugraph import MGGraph, ResourceHandle, GraphProperties, degrees
-
-from ogb.nodeproppred import NodePropPredDataset
-
-
-def init_pytorch(rank, world_size):
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "12355"
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
-
-
-def calc_degree(rank: int, world_size: int, uid, edgelist):
-    init_pytorch(rank, world_size)
-
-    device = rank
-    cugraph_comms_init(rank, world_size, uid, device)
-
-    print(f"rank {rank} initialized cugraph")
-
-    src = cudf.Series(np.array_split(edgelist[0], world_size)[rank])
-    dst = cudf.Series(np.array_split(edgelist[1], world_size)[rank])
-
-    seeds = cudf.Series(np.arange(rank * 50, (rank + 1) * 50))
-    handle = ResourceHandle(cugraph_comms_get_raft_handle().getHandle())
-
-    print("constructing graph")
-    G = MGGraph(
-        handle,
-        GraphProperties(is_multigraph=True, is_symmetric=False),
-        [src],
-        [dst],
-    )
-    print("graph constructed")
-
-    print("calculating degrees")
-    vertices, in_deg, out_deg = degrees(handle, G, seeds, do_expensive_check=False)
-    print("degrees calculated")
-
-    print("constructing dataframe")
-    df = pandas.DataFrame(
-        {"v": vertices.get(), "in": in_deg.get(), "out": out_deg.get()}
-    )
-    print(df)
-
-    dist.barrier()
-    cugraph_comms_shutdown()
-    print(f"rank {rank} shut down cugraph")
-
-
-def main():
-    world_size = torch.cuda.device_count()
-    uid = cugraph_comms_create_unique_id()
-
-    dataset = NodePropPredDataset("ogbn-products")
-    el = dataset[0][0]["edge_index"].astype("int64")
-
-    tmp.spawn(
-        calc_degree,
-        args=(world_size, uid, el),
-        nprocs=world_size,
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_sg.py
deleted file mode 100644
index 2f273ee581e..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_sg.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This example shows how to use cuGraph and pylibcuGraph to run a
-# single-GPU workflow.  Most users of the GNN packages will not interact
-# with cuGraph directly.  This example is intented for users who want
-# to extend cuGraph within a PyTorch workflow.
-
-import pandas
-import numpy as np
-
-import cudf
-
-from pylibcugraph import SGGraph, ResourceHandle, GraphProperties, degrees
-
-from ogb.nodeproppred import NodePropPredDataset
-
-
-def calc_degree(edgelist):
-    src = cudf.Series(edgelist[0])
-    dst = cudf.Series(edgelist[1])
-
-    seeds = cudf.Series(np.arange(256))
-
-    print("constructing graph")
-    G = SGGraph(
-        ResourceHandle(),
-        GraphProperties(is_multigraph=True, is_symmetric=False),
-        src,
-        dst,
-    )
-    print("graph constructed")
-
-    print("calculating degrees")
-    vertices, in_deg, out_deg = degrees(
-        ResourceHandle(), G, seeds, do_expensive_check=False
-    )
-    print("degrees calculated")
-
-    print("constructing dataframe")
-    df = pandas.DataFrame(
-        {"v": vertices.get(), "in": in_deg.get(), "out": out_deg.get()}
-    )
-    print(df)
-
-    print("done")
-
-
-def main():
-    dataset = NodePropPredDataset("ogbn-products")
-    el = dataset[0][0]["edge_index"].astype("int64")
-    calc_degree(el)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_mnmg.py b/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_mnmg.py
deleted file mode 100644
index 5c75e01e6f5..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_mnmg.py
+++ /dev/null
@@ -1,418 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This example illustrates link classification using the ogbl-wikikg2 dataset.
-
-import os
-import json
-import argparse
-import warnings
-
-import torch
-
-import torch.nn.functional as F
-from torch.nn import Parameter
-from torch_geometric.nn import FastRGCNConv, GAE
-from torch.nn.parallel import DistributedDataParallel
-
-from ogb.linkproppred import PygLinkPropPredDataset
-
-import cugraph_pyg
-
-from cugraph.gnn import (
-    cugraph_comms_init,
-    cugraph_comms_create_unique_id,
-    cugraph_comms_shutdown,
-)
-
-from pylibwholegraph.torch.initialize import (
-    init as wm_init,
-    finalize as wm_finalize,
-)
-
-
-# Enable cudf spilling to save gpu memory
-from cugraph.testing.mg_utils import enable_spilling
-
-# Ensures that a CUDA context is not created on import of rapids.
-# Allows pytorch to create the context instead
-os.environ["RAPIDS_NO_INITIALIZE"] = "1"
-
-
-def init_pytorch_worker(global_rank, local_rank, world_size, uid):
-    import rmm
-
-    rmm.reinitialize(devices=[local_rank], pool_allocator=True, managed_memory=True)
-
-    import cupy
-    from rmm.allocators.cupy import rmm_cupy_allocator
-
-    cupy.cuda.set_allocator(rmm_cupy_allocator)
-
-    cugraph_comms_init(
-        global_rank,
-        world_size,
-        uid,
-        local_rank,
-    )
-
-    wm_init(global_rank, world_size, local_rank, torch.cuda.device_count())
-
-    enable_spilling()
-
-
-class RGCNEncoder(torch.nn.Module):
-    def __init__(self, num_nodes, hidden_channels, num_relations, num_bases=30):
-        super().__init__()
-        self.node_emb = Parameter(torch.empty(num_nodes, hidden_channels))
-        self.conv1 = FastRGCNConv(
-            hidden_channels, hidden_channels, num_relations, num_bases=num_bases
-        )
-        self.conv2 = FastRGCNConv(
-            hidden_channels, hidden_channels, num_relations, num_bases=num_bases
-        )
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        torch.nn.init.xavier_uniform_(self.node_emb)
-        self.conv1.reset_parameters()
-        self.conv2.reset_parameters()
-
-    def forward(self, edge_index, edge_type):
-        x = self.node_emb
-        x = self.conv1(x, edge_index, edge_type).relu_()
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv2(x, edge_index, edge_type)
-        return x
-
-
-def train(epoch, model, optimizer, train_loader, edge_feature_store, num_steps=None):
-    model.train()
-    optimizer.zero_grad()
-
-    for i, batch in enumerate(train_loader):
-        r = edge_feature_store[("n", "e", "n"), "rel"][batch.e_id].flatten().cuda()
-        z = model.encode(batch.edge_index, r)
-
-        loss = model.recon_loss(z, batch.edge_index)
-        loss.backward()
-        optimizer.step()
-
-        if i % 10 == 0:
-            print(
-                f"Epoch: {epoch:02d}, Iteration: {i:02d}, Loss: {loss:.4f}", flush=True
-            )
-        if num_steps and i == num_steps:
-            break
-
-
-def test(stage, epoch, model, loader, num_steps=None):
-    # TODO support ROC-AUC metric
-    # Predict probabilities of future edges
-    model.eval()
-
-    rr = 0.0
-    for i, (h, h_neg, t, t_neg, r) in enumerate(loader):
-        if num_steps and i >= num_steps:
-            break
-
-        ei = torch.concatenate(
-            [
-                torch.stack([h, t]).cuda(),
-                torch.stack([h_neg.flatten(), t_neg.flatten()]).cuda(),
-            ],
-            dim=-1,
-        )
-
-        r = torch.concatenate([r, torch.repeat_interleave(r, h_neg.shape[-1])]).cuda()
-
-        z = model.encode(ei, r)
-        q = model.decode(z, ei)
-
-        _, ix = torch.sort(q, descending=True)
-        rr += 1.0 / (1.0 + ix[0])
-
-    print(f"epoch {epoch:02d} {stage} mrr:", rr / i, flush=True)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--hidden_channels", type=int, default=128)
-    parser.add_argument("--num_layers", type=int, default=1)
-    parser.add_argument("--lr", type=float, default=0.001)
-    parser.add_argument("--epochs", type=int, default=4)
-    parser.add_argument("--batch_size", type=int, default=16384)
-    parser.add_argument("--num_neg", type=int, default=500)
-    parser.add_argument("--num_pos", type=int, default=-1)
-    parser.add_argument("--fan_out", type=int, default=10)
-    parser.add_argument("--dataset", type=str, default="ogbl-wikikg2")
-    parser.add_argument("--dataset_root", type=str, default="dataset")
-    parser.add_argument("--seeds_per_call", type=int, default=-1)
-    parser.add_argument("--n_devices", type=int, default=-1)
-    parser.add_argument("--skip_partition", action="store_true")
-
-    return parser.parse_args()
-
-
-def run_train(rank, world_size, model, data, edge_feature_store, meta, splits, args):
-    model = model.to(rank)
-    model = GAE(DistributedDataParallel(model, device_ids=[rank]))
-    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
-
-    eli = torch.stack([splits["train"]["head"], splits["train"]["tail"]])
-
-    train_loader = cugraph_pyg.loader.LinkNeighborLoader(
-        data,
-        [args.fan_out] * args.num_layers,
-        edge_label_index=eli,
-        local_seeds_per_call=args.seeds_per_call if args.seeds_per_call > 0 else None,
-        batch_size=args.batch_size,
-        shuffle=True,
-        drop_last=True,
-    )
-
-    def get_eval_loader(stage: str):
-        head = splits[stage]["head"]
-        tail = splits[stage]["tail"]
-
-        head_neg = splits[stage]["head_neg"][:, : args.num_neg]
-        tail_neg = splits[stage]["tail_neg"][:, : args.num_neg]
-
-        rel = splits[stage]["relation"]
-
-        return torch.utils.data.DataLoader(
-            torch.utils.data.TensorDataset(
-                head.pin_memory(),
-                head_neg.pin_memory(),
-                tail.pin_memory(),
-                tail_neg.pin_memory(),
-                rel.pin_memory(),
-            ),
-            batch_size=1,
-            shuffle=False,
-            drop_last=True,
-        )
-
-    test_loader = get_eval_loader("test")
-    valid_loader = get_eval_loader("valid")
-
-    num_train_steps = (args.num_pos // args.batch_size) if args.num_pos > 0 else 100
-
-    for epoch in range(1, 1 + args.epochs):
-        train(
-            epoch,
-            model,
-            optimizer,
-            train_loader,
-            edge_feature_store,
-            num_steps=num_train_steps,
-        )
-        test("validation", epoch, model, valid_loader, num_steps=1024)
-
-    test("test", epoch, model, test_loader, num_steps=1024)
-
-    wm_finalize()
-    cugraph_comms_shutdown()
-
-
-def partition_data(
-    data, splits, meta, edge_path, rel_path, pos_path, neg_path, meta_path
-):
-    # Split and save edge index
-    os.makedirs(
-        edge_path,
-        exist_ok=True,
-    )
-    for (r, e) in enumerate(torch.tensor_split(data.edge_index, world_size, dim=1)):
-        rank_path = os.path.join(edge_path, f"rank={r}.pt")
-        torch.save(
-            e.clone(),
-            rank_path,
-        )
-
-    # Split and save edge reltypes
-    os.makedirs(
-        rel_path,
-        exist_ok=True,
-    )
-    for (r, f) in enumerate(torch.tensor_split(data.edge_reltype, world_size)):
-        rank_path = os.path.join(rel_path, f"rank={r}.pt")
-        torch.save(
-            f.clone(),
-            rank_path,
-        )
-
-    # Split and save positive edges
-    os.makedirs(
-        pos_path,
-        exist_ok=True,
-    )
-    for stage in ["train", "test", "valid"]:
-        for (r, n) in enumerate(
-            torch.tensor_split(
-                torch.stack([splits[stage]["head"], splits[stage]["tail"]]),
-                world_size,
-                dim=-1,
-            )
-        ):
-            rank_path = os.path.join(pos_path, f"rank={r}_{stage}.pt")
-            torch.save(
-                n.clone(),
-                rank_path,
-            )
-
-    # Split and save negative edges
-    os.makedirs(
-        neg_path,
-        exist_ok=True,
-    )
-    for stage in ["test", "valid"]:
-        for (r, n) in enumerate(
-            torch.tensor_split(
-                torch.stack([splits[stage]["head_neg"], splits[stage]["tail_neg"]]),
-                world_size,
-                dim=1,
-            )
-        ):
-            rank_path = os.path.join(neg_path, f"rank={r}_{stage}.pt")
-            torch.save(n.clone(), rank_path)
-        for (r, n) in enumerate(
-            torch.tensor_split(splits[stage]["relation"], world_size, dim=-1)
-        ):
-            print(n)
-            rank_path = os.path.join(neg_path, f"rank={r}_{stage}_relation.pt")
-            torch.save(n.clone(), rank_path)
-
-    with open(meta_path, "w") as f:
-        json.dump(meta, f)
-
-
-def load_partitioned_data(rank, edge_path, rel_path, pos_path, neg_path, meta_path):
-    from cugraph_pyg.data import GraphStore, WholeFeatureStore, TensorDictFeatureStore
-
-    graph_store = GraphStore()
-    feature_store = TensorDictFeatureStore()
-    edge_feature_store = WholeFeatureStore()
-
-    # Load edge index
-    graph_store[("n", "e", "n"), "coo"] = torch.load(
-        os.path.join(edge_path, f"rank={rank}.pt")
-    )
-
-    # Load edge rel type
-    edge_feature_store[("n", "e", "n"), "rel"] = torch.load(
-        os.path.join(rel_path, f"rank={rank}.pt")
-    )
-
-    splits = {}
-
-    # Load positive edges
-    for stage in ["train", "test", "valid"]:
-        head, tail = torch.load(os.path.join(pos_path, f"rank={rank}_{stage}.pt"))
-        splits[stage] = {
-            "head": head,
-            "tail": tail,
-        }
-
-    # Load negative edges
-    for stage in ["test", "valid"]:
-        head_neg, tail_neg = torch.load(
-            os.path.join(neg_path, f"rank={rank}_{stage}.pt")
-        )
-        relation = torch.load(
-            os.path.join(neg_path, f"rank={rank}_{stage}_relation.pt")
-        )
-        splits[stage]["head_neg"] = head_neg
-        splits[stage]["tail_neg"] = tail_neg
-        splits[stage]["relation"] = relation
-
-    with open(meta_path, "r") as f:
-        meta = json.load(f)
-
-    return (feature_store, graph_store), edge_feature_store, splits, meta
-
-
-if __name__ == "__main__":
-    args = parse_args()
-
-    if "LOCAL_RANK" in os.environ:
-        torch.distributed.init_process_group("nccl")
-        world_size = torch.distributed.get_world_size()
-        global_rank = torch.distributed.get_rank()
-        local_rank = int(os.environ["LOCAL_RANK"])
-        device = torch.device(local_rank)
-
-        # Create the uid needed for cuGraph comms
-        if global_rank == 0:
-            cugraph_id = [cugraph_comms_create_unique_id()]
-        else:
-            cugraph_id = [None]
-        torch.distributed.broadcast_object_list(cugraph_id, src=0, device=device)
-        cugraph_id = cugraph_id[0]
-
-        init_pytorch_worker(global_rank, local_rank, world_size, cugraph_id)
-
-        # Split the data
-        edge_path = os.path.join(args.dataset_root, args.dataset + "_eix_part")
-        rel_path = os.path.join(args.dataset_root, args.dataset + "_rel_part")
-        pos_path = os.path.join(args.dataset_root, args.dataset + "_e_pos_part")
-        neg_path = os.path.join(args.dataset_root, args.dataset + "_e_neg_part")
-        meta_path = os.path.join(args.dataset_root, args.dataset + "_meta.json")
-
-        if not args.skip_partition and global_rank == 0:
-            data = PygLinkPropPredDataset(args.dataset, root=args.dataset_root)
-            dataset = data[0]
-
-            splits = data.get_edge_split()
-
-            meta = {}
-            meta["num_nodes"] = int(dataset.num_nodes)
-            meta["num_rels"] = int(dataset.edge_reltype.max()) + 1
-
-            partition_data(
-                dataset,
-                splits,
-                meta,
-                edge_path=edge_path,
-                rel_path=rel_path,
-                pos_path=pos_path,
-                neg_path=neg_path,
-                meta_path=meta_path,
-            )
-            del data
-            del dataset
-            del splits
-        torch.distributed.barrier()
-
-        # Load partitions
-        data, edge_feature_store, splits, meta = load_partitioned_data(
-            rank=global_rank,
-            edge_path=edge_path,
-            rel_path=rel_path,
-            pos_path=pos_path,
-            neg_path=neg_path,
-            meta_path=meta_path,
-        )
-        torch.distributed.barrier()
-
-        model = RGCNEncoder(
-            meta["num_nodes"],
-            hidden_channels=args.hidden_channels,
-            num_relations=meta["num_rels"],
-        )
-
-        run_train(
-            global_rank, world_size, model, data, edge_feature_store, meta, splits, args
-        )
-    else:
-        warnings.warn("This script should be run with 'torchrun`.  Exiting.")
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_sg.py
deleted file mode 100644
index 67d7eecc7c2..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_sg.py
+++ /dev/null
@@ -1,219 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This example illustrates link classification using the ogbl-wikikg2 dataset.
-
-import argparse
-
-from typing import Tuple, Dict, Any
-
-import torch
-import cupy
-
-import rmm
-from rmm.allocators.cupy import rmm_cupy_allocator
-from rmm.allocators.torch import rmm_torch_allocator
-
-# Must change allocators immediately upon import
-# or else other imports will cause memory to be
-# allocated and prevent changing the allocator
-rmm.reinitialize(devices=[0], pool_allocator=True, managed_memory=True)
-cupy.cuda.set_allocator(rmm_cupy_allocator)
-torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
-
-import torch.nn.functional as F  # noqa: E402
-from torch.nn import Parameter  # noqa: E402
-from torch_geometric.nn import FastRGCNConv, GAE  # noqa: E402
-import torch_geometric  # noqa: E402
-import cugraph_pyg  # noqa: E402
-
-# Enable cudf spilling to save gpu memory
-from cugraph.testing.mg_utils import enable_spilling  # noqa: E402
-
-enable_spilling()
-
-
-class RGCNEncoder(torch.nn.Module):
-    def __init__(self, num_nodes, hidden_channels, num_relations, num_bases=30):
-        super().__init__()
-        self.node_emb = Parameter(torch.empty(num_nodes, hidden_channels))
-        self.conv1 = FastRGCNConv(
-            hidden_channels, hidden_channels, num_relations, num_bases=num_bases
-        )
-        self.conv2 = FastRGCNConv(
-            hidden_channels, hidden_channels, num_relations, num_bases=num_bases
-        )
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        torch.nn.init.xavier_uniform_(self.node_emb)
-        self.conv1.reset_parameters()
-        self.conv2.reset_parameters()
-
-    def forward(self, edge_index, edge_type):
-        x = self.node_emb
-        x = self.conv1(x, edge_index, edge_type).relu_()
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv2(x, edge_index, edge_type)
-        return x
-
-
-def load_data(
-    dataset_str, dataset_root: str
-) -> Tuple[
-    Tuple["torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"],
-    "torch_geometric.data.FeatureStore",
-    Dict[str, Dict[str, "torch.Tensor"]],
-    Dict[str, Any],
-]:
-    from ogb.linkproppred import PygLinkPropPredDataset
-
-    data = PygLinkPropPredDataset(dataset_str, root=dataset_root)
-    dataset = data[0]
-
-    splits = data.get_edge_split()
-
-    from cugraph_pyg.data import GraphStore, TensorDictFeatureStore
-
-    graph_store = GraphStore()
-    feature_store = TensorDictFeatureStore()
-    edge_feature_store = TensorDictFeatureStore()
-    meta = {}
-
-    graph_store[("n", "e", "n"), "coo"] = dataset.edge_index
-    edge_feature_store[("n", "e", "n"), "rel"] = dataset.edge_reltype.pin_memory()
-    meta["num_nodes"] = dataset.num_nodes
-    meta["num_rels"] = dataset.edge_reltype.max() + 1
-
-    return (feature_store, graph_store), edge_feature_store, splits, meta
-
-
-def train(epoch, model, optimizer, train_loader, edge_feature_store):
-    model.train()
-    optimizer.zero_grad()
-
-    for i, batch in enumerate(train_loader):
-        r = edge_feature_store[("n", "e", "n"), "rel"][batch.e_id].flatten().cuda()
-        z = model.encode(batch.edge_index, r)
-
-        loss = model.recon_loss(z, batch.edge_index)
-        loss.backward()
-        optimizer.step()
-
-        if i % 10 == 0:
-            print(f"Epoch: {epoch:02d}, Iteration: {i:02d}, Loss: {loss:.4f}")
-            if i == 100:
-                break
-
-
-def test(stage, epoch, model, loader, num_steps=None):
-    # TODO support ROC-AUC metric
-    # Predict probabilities of future edges
-    model.eval()
-
-    rr = 0.0
-    for i, (h, h_neg, t, t_neg, r) in enumerate(loader):
-        if num_steps and i >= num_steps:
-            break
-
-        ei = torch.concatenate(
-            [
-                torch.stack([h, t]).cuda(),
-                torch.stack([h_neg.flatten(), t_neg.flatten()]).cuda(),
-            ],
-            dim=-1,
-        )
-
-        r = torch.concatenate([r, torch.repeat_interleave(r, h_neg.shape[-1])]).cuda()
-
-        z = model.encode(ei, r)
-        q = model.decode(z, ei)
-
-        _, ix = torch.sort(q, descending=True)
-        rr += 1.0 / (1.0 + ix[0])
-
-    print(f"epoch {epoch:02d} {stage} mrr:", rr / i)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--hidden_channels", type=int, default=128)
-    parser.add_argument("--num_layers", type=int, default=1)
-    parser.add_argument("--lr", type=float, default=0.001)
-    parser.add_argument("--epochs", type=int, default=4)
-    parser.add_argument("--batch_size", type=int, default=16384)
-    parser.add_argument("--num_neg", type=int, default=500)
-    parser.add_argument("--fan_out", type=int, default=10)
-    parser.add_argument("--dataset", type=str, default="ogbl-wikikg2")
-    parser.add_argument("--dataset_root", type=str, default="dataset")
-    parser.add_argument("--seeds_per_call", type=int, default=-1)
-
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_args()
-
-    data, edge_feature_store, splits, meta = load_data(args.dataset, args.dataset_root)
-
-    model = GAE(
-        RGCNEncoder(
-            meta["num_nodes"],
-            hidden_channels=args.hidden_channels,
-            num_relations=meta["num_rels"],
-        )
-    ).cuda()
-    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
-
-    train_loader = cugraph_pyg.loader.LinkNeighborLoader(
-        data,
-        [args.fan_out] * args.num_layers,
-        edge_label_index=torch.stack(
-            [splits["train"]["head"], splits["train"]["tail"]]
-        ),
-        local_seeds_per_call=args.seeds_per_call if args.seeds_per_call > 0 else None,
-        batch_size=args.batch_size,
-        shuffle=True,
-        drop_last=True,
-    )
-
-    def get_eval_loader(stage: str):
-        head = splits[stage]["head"]
-        tail = splits[stage]["tail"]
-
-        head_neg = splits[stage]["head_neg"][:, : args.num_neg]
-        tail_neg = splits[stage]["tail_neg"][:, : args.num_neg]
-
-        rel = splits[stage]["relation"]
-
-        return torch.utils.data.DataLoader(
-            torch.utils.data.TensorDataset(
-                head.pin_memory(),
-                head_neg.pin_memory(),
-                tail.pin_memory(),
-                tail_neg.pin_memory(),
-                rel.pin_memory(),
-            ),
-            batch_size=1,
-            shuffle=False,
-            drop_last=True,
-        )
-
-    test_loader = get_eval_loader("test")
-    valid_loader = get_eval_loader("valid")
-
-    for epoch in range(1, 1 + args.epochs):
-        train(epoch, model, optimizer, train_loader, edge_feature_store)
-        test("validation", epoch, model, valid_loader, num_steps=1024)
-
-    test("test", epoch, model, test_loader, num_steps=1024)
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_snmg.py b/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_snmg.py
deleted file mode 100644
index 2c0ae53a08e..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_snmg.py
+++ /dev/null
@@ -1,320 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This example illustrates link classification using the ogbl-wikikg2 dataset.
-
-import os
-import argparse
-import warnings
-
-from typing import Tuple, Any
-
-import torch
-
-import torch.nn.functional as F
-from torch.nn import Parameter
-from torch_geometric.nn import FastRGCNConv, GAE
-from torch.nn.parallel import DistributedDataParallel
-
-import torch_geometric
-import cugraph_pyg
-
-from cugraph.gnn import (
-    cugraph_comms_init,
-    cugraph_comms_create_unique_id,
-    cugraph_comms_shutdown,
-)
-
-from pylibwholegraph.torch.initialize import (
-    init as wm_init,
-    finalize as wm_finalize,
-)
-
-
-# Enable cudf spilling to save gpu memory
-from cugraph.testing.mg_utils import enable_spilling
-
-# Ensures that a CUDA context is not created on import of rapids.
-# Allows pytorch to create the context instead
-os.environ["RAPIDS_NO_INITIALIZE"] = "1"
-
-
-def init_pytorch_worker(rank, world_size, uid):
-    import rmm
-
-    rmm.reinitialize(devices=[rank], pool_allocator=True, managed_memory=True)
-
-    import cupy
-    from rmm.allocators.cupy import rmm_cupy_allocator
-
-    cupy.cuda.set_allocator(rmm_cupy_allocator)
-
-    cugraph_comms_init(
-        rank,
-        world_size,
-        uid,
-        rank,
-    )
-
-    wm_init(rank, world_size, rank, world_size)
-
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "12355"
-    torch.distributed.init_process_group(
-        "nccl",
-        rank=rank,
-        world_size=world_size,
-    )
-
-    enable_spilling()
-
-
-class RGCNEncoder(torch.nn.Module):
-    def __init__(self, num_nodes, hidden_channels, num_relations, num_bases=30):
-        super().__init__()
-        self.node_emb = Parameter(torch.empty(num_nodes, hidden_channels))
-        self.conv1 = FastRGCNConv(
-            hidden_channels, hidden_channels, num_relations, num_bases=num_bases
-        )
-        self.conv2 = FastRGCNConv(
-            hidden_channels, hidden_channels, num_relations, num_bases=num_bases
-        )
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        torch.nn.init.xavier_uniform_(self.node_emb)
-        self.conv1.reset_parameters()
-        self.conv2.reset_parameters()
-
-    def forward(self, edge_index, edge_type):
-        x = self.node_emb
-        x = self.conv1(x, edge_index, edge_type).relu_()
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv2(x, edge_index, edge_type)
-        return x
-
-
-def load_data(
-    rank: int,
-    world_size: int,
-    data: Any,
-) -> Tuple[
-    Tuple["torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"],
-    "torch_geometric.data.FeatureStore",
-]:
-    from cugraph_pyg.data import GraphStore, WholeFeatureStore, TensorDictFeatureStore
-
-    graph_store = GraphStore()
-    feature_store = TensorDictFeatureStore()  # empty fs required by PyG
-    edge_feature_store = WholeFeatureStore()
-
-    graph_store[("n", "e", "n"), "coo"] = torch.tensor_split(
-        data.edge_index.cuda(), world_size, dim=1
-    )[rank]
-
-    edge_feature_store[("n", "e", "n"), "rel"] = torch.tensor_split(
-        data.edge_reltype.cuda(),
-        world_size,
-    )[rank]
-
-    return (feature_store, graph_store), edge_feature_store
-
-
-def train(epoch, model, optimizer, train_loader, edge_feature_store, num_steps=None):
-    model.train()
-    optimizer.zero_grad()
-
-    for i, batch in enumerate(train_loader):
-        r = edge_feature_store[("n", "e", "n"), "rel"][batch.e_id].flatten().cuda()
-        z = model.encode(batch.edge_index, r)
-
-        loss = model.recon_loss(z, batch.edge_index)
-        loss.backward()
-        optimizer.step()
-
-        if i % 10 == 0:
-            print(
-                f"Epoch: {epoch:02d}, Iteration: {i:02d}, Loss: {loss:.4f}", flush=True
-            )
-        if num_steps and i == num_steps:
-            break
-
-
-def test(stage, epoch, model, loader, num_steps=None):
-    # TODO support ROC-AUC metric
-    # Predict probabilities of future edges
-    model.eval()
-
-    rr = 0.0
-    for i, (h, h_neg, t, t_neg, r) in enumerate(loader):
-        if num_steps and i >= num_steps:
-            break
-
-        ei = torch.concatenate(
-            [
-                torch.stack([h, t]).cuda(),
-                torch.stack([h_neg.flatten(), t_neg.flatten()]).cuda(),
-            ],
-            dim=-1,
-        )
-
-        r = torch.concatenate([r, torch.repeat_interleave(r, h_neg.shape[-1])]).cuda()
-
-        z = model.encode(ei, r)
-        q = model.decode(z, ei)
-
-        _, ix = torch.sort(q, descending=True)
-        rr += 1.0 / (1.0 + ix[0])
-
-    print(f"epoch {epoch:02d} {stage} mrr:", rr / i, flush=True)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--hidden_channels", type=int, default=128)
-    parser.add_argument("--num_layers", type=int, default=1)
-    parser.add_argument("--lr", type=float, default=0.001)
-    parser.add_argument("--epochs", type=int, default=4)
-    parser.add_argument("--batch_size", type=int, default=16384)
-    parser.add_argument("--num_neg", type=int, default=500)
-    parser.add_argument("--num_pos", type=int, default=-1)
-    parser.add_argument("--fan_out", type=int, default=10)
-    parser.add_argument("--dataset", type=str, default="ogbl-wikikg2")
-    parser.add_argument("--dataset_root", type=str, default="dataset")
-    parser.add_argument("--seeds_per_call", type=int, default=-1)
-    parser.add_argument("--n_devices", type=int, default=-1)
-
-    return parser.parse_args()
-
-
-def run_train(rank, world_size, uid, model, data, meta, splits, args):
-    init_pytorch_worker(
-        rank,
-        world_size,
-        uid,
-    )
-
-    model = model.to(rank)
-    model = GAE(DistributedDataParallel(model, device_ids=[rank]))
-    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
-
-    data, edge_feature_store = load_data(rank, world_size, data)
-
-    eli = torch.stack(
-        [
-            torch.tensor_split(splits["train"]["head"], world_size)[rank],
-            torch.tensor_split(splits["train"]["tail"], world_size)[rank],
-        ]
-    )
-
-    train_loader = cugraph_pyg.loader.LinkNeighborLoader(
-        data,
-        [args.fan_out] * args.num_layers,
-        edge_label_index=eli,
-        local_seeds_per_call=args.seeds_per_call if args.seeds_per_call > 0 else None,
-        batch_size=args.batch_size,
-        shuffle=True,
-        drop_last=True,
-    )
-
-    def get_eval_loader(stage: str):
-        head = torch.tensor_split(splits[stage]["head"], world_size)[rank]
-        tail = torch.tensor_split(splits[stage]["tail"], world_size)[rank]
-
-        head_neg = torch.tensor_split(
-            splits[stage]["head_neg"][:, : args.num_neg], world_size
-        )[rank]
-        tail_neg = torch.tensor_split(
-            splits[stage]["tail_neg"][:, : args.num_neg], world_size
-        )[rank]
-
-        rel = torch.tensor_split(splits[stage]["relation"], world_size)[rank]
-
-        return torch.utils.data.DataLoader(
-            torch.utils.data.TensorDataset(
-                head.pin_memory(),
-                head_neg.pin_memory(),
-                tail.pin_memory(),
-                tail_neg.pin_memory(),
-                rel.pin_memory(),
-            ),
-            batch_size=1,
-            shuffle=False,
-            drop_last=True,
-        )
-
-    test_loader = get_eval_loader("test")
-    valid_loader = get_eval_loader("valid")
-
-    num_train_steps = (args.num_pos // args.batch_size) if args.num_pos > 0 else 100
-
-    for epoch in range(1, 1 + args.epochs):
-        train(
-            epoch,
-            model,
-            optimizer,
-            train_loader,
-            edge_feature_store,
-            num_steps=num_train_steps,
-        )
-        test("validation", epoch, model, valid_loader, num_steps=1024)
-
-    test("test", epoch, model, test_loader, num_steps=1024)
-
-    wm_finalize()
-    cugraph_comms_shutdown()
-
-
-if __name__ == "__main__":
-    if "CI_RUN" in os.environ and os.environ["CI_RUN"] == "1":
-        warnings.warn("Skipping SMNG example in CI due to memory limit")
-    else:
-        args = parse_args()
-
-        # change the allocator before any allocations are made
-        from rmm.allocators.torch import rmm_torch_allocator
-
-        torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
-
-        # import ogb here to stop it from creating a context and breaking pytorch/rmm
-        from ogb.linkproppred import PygLinkPropPredDataset
-
-        data = PygLinkPropPredDataset(args.dataset, root=args.dataset_root)
-        dataset = data[0]
-
-        splits = data.get_edge_split()
-
-        meta = {}
-        meta["num_nodes"] = dataset.num_nodes
-        meta["num_rels"] = dataset.edge_reltype.max() + 1
-
-        model = RGCNEncoder(
-            meta["num_nodes"],
-            hidden_channels=args.hidden_channels,
-            num_relations=meta["num_rels"],
-        )
-
-        print("Data =", data)
-        if args.n_devices == -1:
-            world_size = torch.cuda.device_count()
-        else:
-            world_size = args.n_devices
-        print("Using", world_size, "GPUs...")
-
-        uid = cugraph_comms_create_unique_id()
-        torch.multiprocessing.spawn(
-            run_train,
-            (world_size, uid, model, data, meta, splits, args),
-            nprocs=world_size,
-            join=True,
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/examples/start_dask.sh b/python/cugraph-pyg/cugraph_pyg/examples/start_dask.sh
deleted file mode 100755
index 54c82f81298..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/examples/start_dask.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-WORKER_RMM_POOL_SIZE=14G \
-CUDA_VISIBLE_DEVICES=0,1 \
-SCHEDULER_FILE=$(pwd)/scheduler.json \
-../../../../mg_utils/run-dask-process.sh \
-    scheduler workers \
-    --tcp
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py
deleted file mode 100644
index c804b3d1f97..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-from cugraph_pyg.loader.node_loader import NodeLoader
-from cugraph_pyg.loader.neighbor_loader import NeighborLoader
-
-from cugraph_pyg.loader.link_loader import LinkLoader
-from cugraph_pyg.loader.link_neighbor_loader import LinkNeighborLoader
-
-from cugraph_pyg.loader.dask_node_loader import DaskNeighborLoader
-
-from cugraph_pyg.loader.dask_node_loader import BulkSampleLoader
-
-
-def CuGraphNeighborLoader(*args, **kwargs):
-    warnings.warn(
-        "CuGraphNeighborLoader has been renamed to DaskNeighborLoader", FutureWarning
-    )
-    return DaskNeighborLoader(*args, **kwargs)
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py
deleted file mode 100644
index 9b24281b190..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py
+++ /dev/null
@@ -1,558 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tempfile
-
-import os
-import re
-import warnings
-
-import cupy
-import cudf
-
-from cugraph.gnn import BulkSampler
-from cugraph.utilities.utils import import_optional, MissingModule
-
-from cugraph_pyg.data import DaskGraphStore
-from cugraph_pyg.sampler.sampler_utils import (
-    _sampler_output_from_sampling_results_heterogeneous,
-    _sampler_output_from_sampling_results_homogeneous_csr,
-    _sampler_output_from_sampling_results_homogeneous_coo,
-)
-
-from typing import Union, Tuple, Sequence, List, Dict
-
-torch_geometric = import_optional("torch_geometric")
-torch = import_optional("torch")
-InputNodes = (
-    Sequence
-    if isinstance(torch_geometric, MissingModule)
-    else torch_geometric.typing.InputNodes
-)
-
-
-class BulkSampleLoader:
-    """
-    Iterator that executes sampling using Dask and cuGraph and
-    loads sampled minibatches from disk.
-    """
-
-    __ex_parquet_file = re.compile(r"batch=([0-9]+)\-([0-9]+)\.parquet")
-
-    def __init__(
-        self,
-        feature_store: DaskGraphStore,
-        graph_store: DaskGraphStore,
-        input_nodes: InputNodes = None,
-        batch_size: int = 0,
-        *,
-        shuffle: bool = False,
-        drop_last: bool = True,
-        edge_types: Sequence[Tuple[str]] = None,
-        directory: Union[str, tempfile.TemporaryDirectory] = None,
-        input_files: List[str] = None,
-        starting_batch_id: int = 0,
-        batches_per_partition: int = 100,
-        # Sampler args
-        num_neighbors: Union[List[int], Dict[Tuple[str, str, str], List[int]]] = None,
-        replace: bool = True,
-        compression: str = "COO",
-        # Other kwargs for the BulkSampler
-        **kwargs,
-    ):
-        """
-        Executes a bulk sampling job immediately upon creation.
-        Allows iteration over the returned results.
-
-        Parameters
-        ----------
-        feature_store: DaskGraphStore
-            The feature store containing features for the graph.
-
-        graph_store: DaskGraphStore
-            The graph store containing the graph structure.
-
-        input_nodes: InputNodes
-            The input nodes associated with this sampler.
-            If None, this loader will load batches
-            from disk rather than performing sampling in memory.
-
-        batch_size: int
-            The number of input nodes per sampling batch.
-            Generally required unless loading already-sampled
-            data from disk.
-
-        shuffle: bool (optional, default=False)
-            Whether to shuffle the input indices.
-            If True, will shuffle the input indices.
-            If False, will create batches in the original order.
-
-        edge_types: Sequence[Tuple[str]] (optional, default=None)
-            The desired edge types for the subgraph.
-            Defaults to all edges in the graph.
-
-        directory: str (optional, default=new tempdir)
-            The path of the directory to write samples to.
-            Defaults to a new generated temporary directory.
-
-        input_files: List[str] (optional, default=None)
-            The input files to read from the directory containing
-            samples.  This argument is only used when loading
-            alread-sampled batches from disk.
-
-        starting_batch_id: int (optional, default=0)
-            The starting id for each batch.  Defaults to 0.
-
-        batches_per_partition: int (optional, default=100)
-            The number of batches in each output partition.
-            Defaults to 100.  Gets passed to the bulk
-            sampler if there is one; otherwise, this argument
-            is used to determine which files to read.
-
-        num_neighbors: Union[List[int],
-                 Dict[Tuple[str, str, str], List[int]]] (required)
-            The number of neighbors to sample for each node in each iteration.
-            If an entry is set to -1, all neighbors will be included.
-            In heterogeneous graphs, may also take in a dictionary denoting
-            the number of neighbors to sample for each individual edge type.
-
-            Note: in cuGraph, only one value of num_neighbors is currently supported.
-            Passing in a dictionary will result in an exception.
-        """
-
-        self.__feature_store = feature_store
-        self.__graph_store = graph_store
-        self.__next_batch = -1
-        self.__end_exclusive = -1
-        self.__batches_per_partition = batches_per_partition
-        self.__starting_batch_id = starting_batch_id
-
-        self._total_read_time = 0.0
-        self._total_convert_time = 0.0
-        self._total_feature_time = 0.0
-
-        if input_nodes is None:
-            # Will be loading from disk
-            self.__num_batches = input_nodes
-            self.__directory = directory
-            if input_files is None:
-                if isinstance(self.__directory, str):
-                    self.__input_files = iter(os.listdir(self.__directory))
-                else:
-                    self.__input_files = iter(os.listdir(self.__directory.name))
-            else:
-                self.__input_files = iter(input_files)
-            return
-
-        # To accommodate DLFW/PyG 2.5
-        get_input_nodes = torch_geometric.loader.utils.get_input_nodes
-        get_input_nodes_kwargs = {}
-        if "input_id" in get_input_nodes.__annotations__:
-            get_input_nodes_kwargs["input_id"] = None
-        input_node_info = get_input_nodes(
-            (feature_store, graph_store), input_nodes, **get_input_nodes_kwargs
-        )
-
-        # PyG 2.4
-        if len(input_node_info) == 2:
-            input_type, input_nodes = input_node_info
-        # PyG 2.5
-        elif len(input_node_info) == 3:
-            input_type, input_nodes, input_id = input_node_info
-        # Invalid
-        else:
-            raise ValueError("Invalid output from get_input_nodes")
-
-        if input_type is not None:
-            input_nodes = graph_store._get_sample_from_vertex_groups(
-                {input_type: input_nodes}
-            )
-
-        if batch_size is None or batch_size < 1:
-            raise ValueError("Batch size must be >= 1")
-
-        self.__directory = (
-            tempfile.TemporaryDirectory() if directory is None else directory
-        )
-
-        if isinstance(num_neighbors, dict):
-            raise ValueError("num_neighbors dict is currently unsupported!")
-
-        if "renumber" in kwargs:
-            warnings.warn(
-                "Setting renumbering manually could result in invalid output,"
-                " please ensure you intended to do this."
-            )
-            renumber = kwargs.pop("renumber")
-        else:
-            renumber = (
-                True
-                if (
-                    (len(self.__graph_store.node_types) == 1)
-                    and (len(self.__graph_store.edge_types) == 1)
-                )
-                else False
-            )
-
-        bulk_sampler = BulkSampler(
-            batch_size,
-            self.__directory
-            if isinstance(self.__directory, str)
-            else self.__directory.name,
-            self.__graph_store._subgraph(edge_types),
-            fanout_vals=num_neighbors,
-            with_replacement=replace,
-            batches_per_partition=self.__batches_per_partition,
-            renumber=renumber,
-            use_legacy_names=False,
-            deduplicate_sources=True,
-            prior_sources_behavior="exclude",
-            include_hop_column=(compression == "COO"),
-            **kwargs,
-        )
-
-        # Make sure indices are in cupy
-        input_nodes = cupy.asarray(input_nodes)
-
-        # Shuffle
-        if shuffle:
-            cupy.random.shuffle(input_nodes)
-
-        # Truncate if we can't evenly divide the input array
-        stop = (len(input_nodes) // batch_size) * batch_size
-        input_nodes, remainder = cupy.array_split(input_nodes, [stop])
-
-        # Split into batches
-        input_nodes = cupy.split(input_nodes, max(len(input_nodes) // batch_size, 1))
-
-        if not drop_last:
-            input_nodes.append(remainder)
-
-        self.__num_batches = 0
-        for batch_num, batch_i in enumerate(input_nodes):
-            batch_len = len(batch_i)
-            if batch_len > 0:
-                self.__num_batches += 1
-                bulk_sampler.add_batches(
-                    cudf.DataFrame(
-                        {
-                            "start": batch_i,
-                            "batch": cupy.full(
-                                batch_len, batch_num + starting_batch_id, dtype="int32"
-                            ),
-                        }
-                    ),
-                    start_col_name="start",
-                    batch_col_name="batch",
-                )
-
-        bulk_sampler.flush()
-        self.__input_files = iter(
-            os.listdir(
-                self.__directory
-                if isinstance(self.__directory, str)
-                else self.__directory.name
-            )
-        )
-
-    def __next__(self):
-        from time import perf_counter
-
-        start_time_read_data = perf_counter()
-
-        # Load the next set of sampling results if necessary
-        if self.__next_batch >= self.__end_exclusive:
-            if self.__directory is None:
-                raise StopIteration
-
-            # Read the next parquet file into memory
-            dir_path = (
-                self.__directory
-                if isinstance(self.__directory, str)
-                else self.__directory.name
-            )
-
-            # Will raise StopIteration if there are no files left
-            try:
-                fname = next(self.__input_files)
-            except StopIteration as ex:
-                # Won't delete a non-temp dir (since it would just be deleting a string)
-                del self.__directory
-                self.__directory = None
-                raise StopIteration(ex)
-
-            m = self.__ex_parquet_file.match(fname)
-            if m is None:
-                raise ValueError(f"Invalid parquet filename {fname}")
-
-            self.__start_inclusive, end_inclusive = [int(g) for g in m.groups()]
-            self.__next_batch = self.__start_inclusive
-            self.__end_exclusive = end_inclusive + 1
-
-            parquet_path = os.path.join(
-                dir_path,
-                fname,
-            )
-
-            raw_sample_data = cudf.read_parquet(parquet_path)
-
-            if "map" in raw_sample_data.columns:
-                if "renumber_map_offsets" not in raw_sample_data.columns:
-                    num_batches = end_inclusive - self.__start_inclusive + 1
-
-                    map_end = raw_sample_data["map"].iloc[num_batches]
-
-                    map = torch.as_tensor(
-                        raw_sample_data["map"].iloc[0:map_end], device="cuda"
-                    )
-                    raw_sample_data.drop("map", axis=1, inplace=True)
-
-                    self.__renumber_map_offsets = map[0 : num_batches + 1] - map[0]
-                    self.__renumber_map = map[num_batches + 1 :]
-                else:
-                    self.__renumber_map = raw_sample_data["map"]
-                    self.__renumber_map_offsets = raw_sample_data[
-                        "renumber_map_offsets"
-                    ]
-                    raw_sample_data.drop(
-                        columns=["map", "renumber_map_offsets"], inplace=True
-                    )
-
-                    self.__renumber_map.dropna(inplace=True)
-                    self.__renumber_map = torch.as_tensor(
-                        self.__renumber_map, device="cuda"
-                    )
-
-                    self.__renumber_map_offsets.dropna(inplace=True)
-                    self.__renumber_map_offsets = torch.as_tensor(
-                        self.__renumber_map_offsets, device="cuda"
-                    )
-
-            else:
-                self.__renumber_map = None
-
-            self.__data = raw_sample_data
-            self.__coo = "majors" in self.__data.columns
-            if self.__coo:
-                self.__data.dropna(inplace=True)
-
-            if (
-                len(self.__graph_store.edge_types) == 1
-                and len(self.__graph_store.node_types) == 1
-            ):
-                if self.__coo:
-                    group_cols = ["batch_id", "hop_id"]
-                    self.__data_index = self.__data.groupby(
-                        group_cols, as_index=True
-                    ).agg({"majors": "max", "minors": "max"})
-                    self.__data_index.rename(
-                        columns={"majors": "src_max", "minors": "dst_max"},
-                        inplace=True,
-                    )
-                    self.__data_index = self.__data_index.to_dict(orient="index")
-                else:
-                    self.__data_index = None
-
-                    self.__label_hop_offsets = self.__data["label_hop_offsets"]
-                    self.__data.drop(columns=["label_hop_offsets"], inplace=True)
-                    self.__label_hop_offsets.dropna(inplace=True)
-                    self.__label_hop_offsets = torch.as_tensor(
-                        self.__label_hop_offsets, device="cuda"
-                    )
-                    self.__label_hop_offsets -= self.__label_hop_offsets[0].clone()
-
-                    self.__major_offsets = self.__data["major_offsets"]
-                    self.__data.drop(columns="major_offsets", inplace=True)
-                    self.__major_offsets.dropna(inplace=True)
-                    self.__major_offsets = torch.as_tensor(
-                        self.__major_offsets, device="cuda"
-                    )
-                    self.__major_offsets -= self.__major_offsets[0].clone()
-
-                    self.__minors = self.__data["minors"]
-                    self.__data.drop(columns="minors", inplace=True)
-                    self.__minors.dropna(inplace=True)
-                    self.__minors = torch.as_tensor(self.__minors, device="cuda")
-
-                    num_batches = self.__end_exclusive - self.__start_inclusive
-                    offsets_len = len(self.__label_hop_offsets) - 1
-                    if offsets_len % num_batches != 0:
-                        raise ValueError("invalid label-hop offsets")
-                    self.__fanout_length = int(offsets_len / num_batches)
-
-        end_time_read_data = perf_counter()
-        self._total_read_time += end_time_read_data - start_time_read_data
-
-        # Pull the next set of sampling results out of the dataframe in memory
-        if self.__coo:
-            f = self.__data["batch_id"] == self.__next_batch
-        if self.__renumber_map is not None:
-            i = self.__next_batch - self.__start_inclusive
-
-            # this should avoid d2h copy
-            current_renumber_map = self.__renumber_map[
-                self.__renumber_map_offsets[i] : self.__renumber_map_offsets[i + 1]
-            ]
-
-        else:
-            current_renumber_map = None
-
-        start_time_convert = perf_counter()
-        # Get and return the sampled subgraph
-        if (
-            len(self.__graph_store.edge_types) == 1
-            and len(self.__graph_store.node_types) == 1
-        ):
-            if self.__coo:
-                sampler_output = _sampler_output_from_sampling_results_homogeneous_coo(
-                    self.__data[f],
-                    current_renumber_map,
-                    self.__graph_store,
-                    self.__data_index,
-                    self.__next_batch,
-                )
-            else:
-                i = (self.__next_batch - self.__start_inclusive) * self.__fanout_length
-                current_label_hop_offsets = self.__label_hop_offsets[
-                    i : i + self.__fanout_length + 1
-                ]
-
-                current_major_offsets = self.__major_offsets[
-                    current_label_hop_offsets[0] : (current_label_hop_offsets[-1] + 1)
-                ]
-
-                current_minors = self.__minors[
-                    current_major_offsets[0] : current_major_offsets[-1]
-                ]
-
-                sampler_output = _sampler_output_from_sampling_results_homogeneous_csr(
-                    current_major_offsets,
-                    current_minors,
-                    current_renumber_map,
-                    self.__graph_store,
-                    current_label_hop_offsets,
-                    self.__data_index,
-                    self.__next_batch,
-                )
-        else:
-            sampler_output = _sampler_output_from_sampling_results_heterogeneous(
-                self.__data[f], current_renumber_map, self.__graph_store
-            )
-
-        # Get ready for next iteration
-        self.__next_batch += 1
-
-        end_time_convert = perf_counter()
-        self._total_convert_time += end_time_convert - start_time_convert
-
-        start_time_feature = perf_counter()
-        # Create a PyG HeteroData object, loading the required features
-        if self.__graph_store != self.__feature_store:
-            # TODO Possibly support this if there is an actual use case
-            raise ValueError("Separate graph and feature stores currently unsupported")
-
-        out = self.__graph_store.filter(
-            "COO" if self.__coo else "CSC",
-            sampler_output.node,
-            sampler_output.row,
-            sampler_output.col,
-            sampler_output.edge,
-        )
-
-        # Account for CSR format in cuGraph vs. CSC format in PyG
-        # TODO deprecate and remove this functionality
-        if self.__coo and self.__graph_store.order == "CSC":
-            for edge_type in out.edge_index_dict:
-                out[edge_type].edge_index = out[edge_type].edge_index.flip(dims=[0])
-
-        out.set_value_dict("num_sampled_nodes", sampler_output.num_sampled_nodes)
-        out.set_value_dict("num_sampled_edges", sampler_output.num_sampled_edges)
-
-        end_time_feature = perf_counter()
-        self._total_feature_time = end_time_feature - start_time_feature
-
-        return out
-
-    @property
-    def _starting_batch_id(self):
-        return self.__starting_batch_id
-
-    def __iter__(self):
-        return self
-
-
-class DaskNeighborLoader:
-    """
-    Duck-typed version of the PyG NeighborLoader interface that uses
-    Dask to sample nodes using the uniform neighbor sampling algorithm.
-    """
-
-    def __init__(
-        self,
-        data: Union[DaskGraphStore, Tuple[DaskGraphStore, DaskGraphStore]],
-        input_nodes: Union[InputNodes, int] = None,
-        batch_size: int = None,
-        **kwargs,
-    ):
-        """
-        Constructs a new DaskNeighborLoader object.
-
-        Parameters
-        ----------
-        data: DaskGraphStore or (DaskGraphStore, DaskGraphStore)
-            The DaskGraphStore or stores where the graph/feature data is held.
-
-        batch_size: int (required)
-            The number of input nodes in each batch.
-
-        input_nodes: Union[InputNodes, int] (required)
-            The input nodes associated with this sampler.
-
-        **kwargs: kwargs
-            Keyword arguments to pass through for sampling.
-            i.e. "shuffle", "fanout"
-            See BulkSampleLoader.
-        """
-
-        if input_nodes is None:
-            raise ValueError("input_nodes is required")
-        if batch_size is None:
-            raise ValueError("batch_size is required")
-
-        # Allow passing in a feature store and graph store as a tuple, as
-        # in the standard PyG API.  If only one is passed, it is assumed
-        # it is behaving as both a graph store and a feature store.
-        if isinstance(data, (list, tuple)):
-            self.__feature_store, self.__graph_store = data
-        else:
-            self.__feature_store = data
-            self.__graph_store = data
-
-        self.__batch_size = batch_size
-        self.__input_nodes = input_nodes
-        self.inner_loader_args = kwargs
-
-    @property
-    def batch_size(self) -> int:
-        return self.__batch_size
-
-    def __iter__(self):
-        self.current_loader = BulkSampleLoader(
-            self.__feature_store,
-            self.__graph_store,
-            self.__input_nodes,
-            self.__batch_size,
-            **self.inner_loader_args,
-        )
-
-        return self.current_loader
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/link_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/link_loader.py
deleted file mode 100644
index 77e2ac4f99d..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/loader/link_loader.py
+++ /dev/null
@@ -1,205 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-import cugraph_pyg
-from typing import Union, Tuple, Callable, Optional
-
-from cugraph.utilities.utils import import_optional
-
-torch_geometric = import_optional("torch_geometric")
-torch = import_optional("torch")
-
-
-class LinkLoader:
-    """
-    Duck-typed version of torch_geometric.loader.LinkLoader.
-    Loads samples from batches of input nodes using a
-    `~cugraph_pyg.sampler.BaseSampler.sample_from_edges`
-    function.
-    """
-
-    def __init__(
-        self,
-        data: Union[
-            "torch_geometric.data.Data",
-            "torch_geometric.data.HeteroData",
-            Tuple[
-                "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"
-            ],
-        ],
-        link_sampler: "cugraph_pyg.sampler.BaseSampler",
-        edge_label_index: "torch_geometric.typing.InputEdges" = None,
-        edge_label: "torch_geometric.typing.OptTensor" = None,
-        edge_label_time: "torch_geometric.typing.OptTensor" = None,
-        neg_sampling: Optional["torch_geometric.sampler.NegativeSampling"] = None,
-        neg_sampling_ratio: Optional[Union[int, float]] = None,
-        transform: Optional[Callable] = None,
-        transform_sampler_output: Optional[Callable] = None,
-        filter_per_worker: Optional[bool] = None,
-        custom_cls: Optional["torch_geometric.data.HeteroData"] = None,
-        input_id: "torch_geometric.typing.OptTensor" = None,
-        batch_size: int = 1,  # refers to number of edges in batch
-        shuffle: bool = False,
-        drop_last: bool = False,
-        **kwargs,
-    ):
-        """
-        Parameters
-        ----------
-            data: Data, HeteroData, or Tuple[FeatureStore, GraphStore]
-                See torch_geometric.loader.NodeLoader.
-            link_sampler: BaseSampler
-                See torch_geometric.loader.LinkLoader.
-            edge_label_index: InputEdges
-                See torch_geometric.loader.LinkLoader.
-            edge_label: OptTensor
-                See torch_geometric.loader.LinkLoader.
-            edge_label_time: OptTensor
-                See torch_geometric.loader.LinkLoader.
-            neg_sampling: Optional[NegativeSampling]
-                Type of negative sampling to perform, if desired.
-                See torch_geometric.loader.LinkLoader.
-            neg_sampling_ratio: Optional[Union[int, float]]
-                Negative sampling ratio.  Affects how many negative
-                samples are generated.
-                See torch_geometric.loader.LinkLoader.
-            transform: Callable (optional, default=None)
-                This argument currently has no effect.
-            transform_sampler_output: Callable (optional, default=None)
-                This argument currently has no effect.
-            filter_per_worker: bool (optional, default=False)
-                This argument currently has no effect.
-            custom_cls: HeteroData
-                This argument currently has no effect.  This loader will
-                always return a Data or HeteroData object.
-            input_id: OptTensor
-                See torch_geometric.loader.LinkLoader.
-
-        """
-        if not isinstance(data, (list, tuple)) or not isinstance(
-            data[1], cugraph_pyg.data.GraphStore
-        ):
-            # Will eventually automatically convert these objects to cuGraph objects.
-            raise NotImplementedError("Currently can't accept non-cugraph graphs")
-
-        if not isinstance(link_sampler, cugraph_pyg.sampler.BaseSampler):
-            raise NotImplementedError("Must provide a cuGraph sampler")
-
-        if edge_label_time is not None:
-            raise ValueError("Temporal sampling is currently unsupported")
-
-        if filter_per_worker:
-            warnings.warn("filter_per_worker is currently ignored")
-
-        if custom_cls is not None:
-            warnings.warn("custom_cls is currently ignored")
-
-        if transform is not None:
-            warnings.warn("transform is currently ignored.")
-
-        if transform_sampler_output is not None:
-            warnings.warn("transform_sampler_output is currently ignored.")
-
-        if neg_sampling_ratio is not None:
-            warnings.warn(
-                "The 'neg_sampling_ratio' argument is deprecated in PyG"
-                " and is not supported in cuGraph-PyG."
-            )
-
-        neg_sampling = torch_geometric.sampler.NegativeSampling.cast(neg_sampling)
-
-        (
-            input_type,
-            edge_label_index,
-        ) = torch_geometric.loader.utils.get_edge_label_index(
-            data,
-            (None, edge_label_index),
-        )
-
-        self.__input_data = torch_geometric.sampler.EdgeSamplerInput(
-            input_id=torch.arange(
-                edge_label_index[0].numel(), dtype=torch.int64, device="cuda"
-            )
-            if input_id is None
-            else input_id,
-            row=edge_label_index[0],
-            col=edge_label_index[1],
-            label=edge_label,
-            time=edge_label_time,
-            input_type=input_type,
-        )
-
-        # Edge label check from torch_geometric.loader.LinkLoader
-        if (
-            neg_sampling is not None
-            and neg_sampling.is_binary()
-            and edge_label is not None
-            and edge_label.min() == 0
-        ):
-            edge_label = edge_label + 1
-
-        if (
-            neg_sampling is not None
-            and neg_sampling.is_triplet()
-            and edge_label is not None
-        ):
-            raise ValueError(
-                "'edge_label' needs to be undefined for "
-                "'triplet'-based negative sampling. Please use "
-                "`src_index`, `dst_pos_index` and "
-                "`neg_pos_index` of the returned mini-batch "
-                "instead to differentiate between positive and "
-                "negative samples."
-            )
-
-        self.__data = data
-
-        self.__link_sampler = link_sampler
-        self.__neg_sampling = neg_sampling
-
-        self.__batch_size = batch_size
-        self.__shuffle = shuffle
-        self.__drop_last = drop_last
-
-    def __iter__(self):
-        if self.__shuffle:
-            perm = torch.randperm(self.__input_data.row.numel())
-        else:
-            perm = torch.arange(self.__input_data.row.numel())
-
-        if self.__drop_last:
-            d = perm.numel() % self.__batch_size
-            perm = perm[:-d]
-
-        input_data = torch_geometric.sampler.EdgeSamplerInput(
-            input_id=self.__input_data.input_id[perm],
-            row=self.__input_data.row[perm],
-            col=self.__input_data.col[perm],
-            label=None
-            if self.__input_data.label is None
-            else self.__input_data.label[perm],
-            time=None
-            if self.__input_data.time is None
-            else self.__input_data.time[perm],
-            input_type=self.__input_data.input_type,
-        )
-
-        return cugraph_pyg.sampler.SampleIterator(
-            self.__data,
-            self.__link_sampler.sample_from_edges(
-                input_data,
-                neg_sampling=self.__neg_sampling,
-            ),
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/link_neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/link_neighbor_loader.py
deleted file mode 100644
index 080565368c4..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/loader/link_neighbor_loader.py
+++ /dev/null
@@ -1,243 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-from typing import Union, Tuple, Optional, Callable, List, Dict
-
-import cugraph_pyg
-from cugraph_pyg.loader import LinkLoader
-from cugraph_pyg.sampler import BaseSampler
-
-from cugraph.gnn import NeighborSampler, DistSampleWriter
-from cugraph.utilities.utils import import_optional
-
-torch_geometric = import_optional("torch_geometric")
-
-
-class LinkNeighborLoader(LinkLoader):
-    """
-    Duck-typed version of torch_geometric.loader.LinkNeighborLoader
-
-    Link loader that implements the neighbor sampling
-    algorithm used in GraphSAGE.
-    """
-
-    def __init__(
-        self,
-        data: Union[
-            "torch_geometric.data.Data",
-            "torch_geometric.data.HeteroData",
-            Tuple[
-                "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"
-            ],
-        ],
-        num_neighbors: Union[
-            List[int], Dict["torch_geometric.typing.EdgeType", List[int]]
-        ],
-        edge_label_index: "torch_geometric.typing.InputEdges" = None,
-        edge_label: "torch_geometric.typing.OptTensor" = None,
-        edge_label_time: "torch_geometric.typing.OptTensor" = None,
-        replace: bool = False,
-        subgraph_type: Union[
-            "torch_geometric.typing.SubgraphType", str
-        ] = "directional",
-        disjoint: bool = False,
-        temporal_strategy: str = "uniform",
-        neg_sampling: Optional["torch_geometric.sampler.NegativeSampling"] = None,
-        neg_sampling_ratio: Optional[Union[int, float]] = None,
-        time_attr: Optional[str] = None,
-        weight_attr: Optional[str] = None,
-        transform: Optional[Callable] = None,
-        transform_sampler_output: Optional[Callable] = None,
-        is_sorted: bool = False,
-        filter_per_worker: Optional[bool] = None,
-        neighbor_sampler: Optional["torch_geometric.sampler.NeighborSampler"] = None,
-        directed: bool = True,  # Deprecated.
-        batch_size: int = 16,  # Refers to number of edges per batch.
-        directory: Optional[str] = None,
-        batches_per_partition=256,
-        format: str = "parquet",
-        compression: Optional[str] = None,
-        local_seeds_per_call: Optional[int] = None,
-        **kwargs,
-    ):
-        """
-        data: Data, HeteroData, or Tuple[FeatureStore, GraphStore]
-            See torch_geometric.loader.LinkNeighborLoader.
-        num_neighbors: List[int] or Dict[EdgeType, List[int]]
-            Fanout values.
-            See torch_geometric.loader.LinkNeighborLoader.
-        edge_label_index: InputEdges
-            Input edges for sampling.
-            See torch_geometric.loader.LinkNeighborLoader.
-        edge_label: OptTensor
-            Labels for input edges.
-            See torch_geometric.loader.LinkNeighborLoader.
-        edge_label_time: OptTensor
-            Time attribute for input edges.
-            See torch_geometric.loader.LinkNeighborLoader.
-        replace: bool (optional, default=False)
-            Whether to sample with replacement.
-            See torch_geometric.loader.LinkNeighborLoader.
-        subgraph_type: Union[SubgraphType, str] (optional, default='directional')
-            The type of subgraph to return.
-            Currently only 'directional' is supported.
-            See torch_geometric.loader.LinkNeighborLoader.
-        disjoint: bool (optional, default=False)
-            Whether to perform disjoint sampling.
-            Currently unsupported.
-            See torch_geometric.loader.LinkNeighborLoader.
-        temporal_strategy: str (optional, default='uniform')
-            Currently only 'uniform' is suppported.
-            See torch_geometric.loader.LinkNeighborLoader.
-        time_attr: str (optional, default=None)
-            Used for temporal sampling.
-            See torch_geometric.loader.LinkNeighborLoader.
-        weight_attr: str (optional, default=None)
-            Used for biased sampling.
-            See torch_geometric.loader.LinkNeighborLoader.
-        transform: Callable (optional, default=None)
-            See torch_geometric.loader.LinkNeighborLoader.
-        transform_sampler_output: Callable (optional, default=None)
-            See torch_geometric.loader.LinkNeighborLoader.
-        is_sorted: bool (optional, default=False)
-            Ignored by cuGraph.
-            See torch_geometric.loader.LinkNeighborLoader.
-        filter_per_worker: bool (optional, default=False)
-            Currently ignored by cuGraph, but this may
-            change once in-memory sampling is implemented.
-            See torch_geometric.loader.LinkNeighborLoader.
-        neighbor_sampler: torch_geometric.sampler.NeighborSampler
-            (optional, default=None)
-            Not supported by cuGraph.
-            See torch_geometric.loader.LinkNeighborLoader.
-        directed: bool (optional, default=True)
-            Deprecated.
-            See torch_geometric.loader.LinkNeighborLoader.
-        batch_size: int (optional, default=16)
-            The number of input nodes per output minibatch.
-            See torch.utils.dataloader.
-        directory: str (optional, default=None)
-            The directory where samples will be temporarily stored,
-            if spilling samples to disk.  If None, this loader
-            will perform buffered in-memory sampling.
-            If writing to disk, setting this argument
-            to a tempfile.TemporaryDirectory with a context
-            manager is a good option but depending on the filesystem,
-            you may want to choose an alternative location with fast I/O
-            intead.
-            See cugraph.gnn.DistSampleWriter.
-        batches_per_partition: int (optional, default=256)
-            The number of batches per partition if writing samples to
-            disk.  Manually tuning this parameter is not recommended
-            but reducing it may help conserve GPU memory.
-            See cugraph.gnn.DistSampleWriter.
-        format: str (optional, default='parquet')
-            If writing samples to disk, they will be written in this
-            file format.
-            See cugraph.gnn.DistSampleWriter.
-        compression: str (optional, default=None)
-            The compression type to use if writing samples to disk.
-            If not provided, it is automatically chosen.
-        local_seeds_per_call: int (optional, default=None)
-            The number of seeds to process within a single sampling call.
-            Manually tuning this parameter is not recommended but reducing
-            it may conserve GPU memory.  The total number of seeds processed
-            per sampling call is equal to the sum of this parameter across
-            all workers.  If not provided, it will be automatically
-            calculated.
-            See cugraph.gnn.DistSampler.
-        **kwargs
-            Other keyword arguments passed to the superclass.
-        """
-
-        subgraph_type = torch_geometric.sampler.base.SubgraphType(subgraph_type)
-
-        if not directed:
-            subgraph_type = torch_geometric.sampler.base.SubgraphType.induced
-            warnings.warn(
-                "The 'directed' argument is deprecated. "
-                "Use subgraph_type='induced' instead."
-            )
-        if subgraph_type != torch_geometric.sampler.base.SubgraphType.directional:
-            raise ValueError("Only directional subgraphs are currently supported")
-        if disjoint:
-            raise ValueError("Disjoint sampling is currently unsupported")
-        if temporal_strategy != "uniform":
-            warnings.warn("Only the uniform temporal strategy is currently supported")
-        if neighbor_sampler is not None:
-            raise ValueError("Passing a neighbor sampler is currently unsupported")
-        if time_attr is not None:
-            raise ValueError("Temporal sampling is currently unsupported")
-        if is_sorted:
-            warnings.warn("The 'is_sorted' argument is ignored by cuGraph.")
-        if not isinstance(data, (list, tuple)) or not isinstance(
-            data[1], cugraph_pyg.data.GraphStore
-        ):
-            # Will eventually automatically convert these objects to cuGraph objects.
-            raise NotImplementedError("Currently can't accept non-cugraph graphs")
-
-        if compression is None:
-            compression = "CSR"
-        elif compression not in ["CSR", "COO"]:
-            raise ValueError("Invalid value for compression (expected 'CSR' or 'COO')")
-
-        writer = (
-            None
-            if directory is None
-            else DistSampleWriter(
-                directory=directory,
-                batches_per_partition=batches_per_partition,
-                format=format,
-            )
-        )
-
-        feature_store, graph_store = data
-
-        if weight_attr is not None:
-            graph_store._set_weight_attr((feature_store, weight_attr))
-
-        sampler = BaseSampler(
-            NeighborSampler(
-                graph_store._graph,
-                writer,
-                retain_original_seeds=True,
-                fanout=num_neighbors,
-                prior_sources_behavior="exclude",
-                deduplicate_sources=True,
-                compression=compression,
-                compress_per_hop=False,
-                with_replacement=replace,
-                local_seeds_per_call=local_seeds_per_call,
-                biased=(weight_attr is not None),
-            ),
-            (feature_store, graph_store),
-            batch_size=batch_size,
-        )
-        # TODO add heterogeneous support and pass graph_store._vertex_offsets
-
-        super().__init__(
-            (feature_store, graph_store),
-            sampler,
-            edge_label_index=edge_label_index,
-            edge_label=edge_label,
-            edge_label_time=edge_label_time,
-            neg_sampling=neg_sampling,
-            neg_sampling_ratio=neg_sampling_ratio,
-            transform=transform,
-            transform_sampler_output=transform_sampler_output,
-            filter_per_worker=filter_per_worker,
-            batch_size=batch_size,
-            **kwargs,
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py
deleted file mode 100644
index 1da2c6dc381..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-from typing import Union, Tuple, Optional, Callable, List, Dict
-
-import cugraph_pyg
-from cugraph_pyg.loader import NodeLoader
-from cugraph_pyg.sampler import BaseSampler
-
-from cugraph.gnn import NeighborSampler, DistSampleWriter
-from cugraph.utilities.utils import import_optional
-
-torch_geometric = import_optional("torch_geometric")
-
-
-class NeighborLoader(NodeLoader):
-    """
-    Duck-typed version of torch_geometric.loader.NeighborLoader
-
-    Node loader that implements the neighbor sampling
-    algorithm used in GraphSAGE.
-    """
-
-    def __init__(
-        self,
-        data: Union[
-            "torch_geometric.data.Data",
-            "torch_geometric.data.HeteroData",
-            Tuple[
-                "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"
-            ],
-        ],
-        num_neighbors: Union[
-            List[int], Dict["torch_geometric.typing.EdgeType", List[int]]
-        ],
-        input_nodes: "torch_geometric.typing.InputNodes" = None,
-        input_time: "torch_geometric.typing.OptTensor" = None,
-        replace: bool = False,
-        subgraph_type: Union[
-            "torch_geometric.typing.SubgraphType", str
-        ] = "directional",
-        disjoint: bool = False,
-        temporal_strategy: str = "uniform",
-        time_attr: Optional[str] = None,
-        weight_attr: Optional[str] = None,
-        transform: Optional[Callable] = None,
-        transform_sampler_output: Optional[Callable] = None,
-        is_sorted: bool = False,
-        filter_per_worker: Optional[bool] = None,
-        neighbor_sampler: Optional["torch_geometric.sampler.NeighborSampler"] = None,
-        directed: bool = True,  # Deprecated.
-        batch_size: int = 16,
-        directory: Optional[str] = None,
-        batches_per_partition=256,
-        format: str = "parquet",
-        compression: Optional[str] = None,
-        local_seeds_per_call: Optional[int] = None,
-        **kwargs,
-    ):
-        """
-        data: Data, HeteroData, or Tuple[FeatureStore, GraphStore]
-            See torch_geometric.loader.NeighborLoader.
-        num_neighbors: List[int] or Dict[EdgeType, List[int]]
-            Fanout values.
-            See torch_geometric.loader.NeighborLoader.
-        input_nodes: InputNodes
-            Input nodes for sampling.
-            See torch_geometric.loader.NeighborLoader.
-        input_time: OptTensor (optional)
-            See torch_geometric.loader.NeighborLoader.
-        replace: bool (optional, default=False)
-            Whether to sample with replacement.
-            See torch_geometric.loader.NeighborLoader.
-        subgraph_type: Union[SubgraphType, str] (optional, default='directional')
-            The type of subgraph to return.
-            Currently only 'directional' is supported.
-            See torch_geometric.loader.NeighborLoader.
-        disjoint: bool (optional, default=False)
-            Whether to perform disjoint sampling.
-            Currently unsupported.
-            See torch_geometric.loader.NeighborLoader.
-        temporal_strategy: str (optional, default='uniform')
-            Currently only 'uniform' is suppported.
-            See torch_geometric.loader.NeighborLoader.
-        time_attr: str (optional, default=None)
-            Used for temporal sampling.
-            See torch_geometric.loader.NeighborLoader.
-        weight_attr: str (optional, default=None)
-            Used for biased sampling.
-            See torch_geometric.loader.NeighborLoader.
-        transform: Callable (optional, default=None)
-            See torch_geometric.loader.NeighborLoader.
-        transform_sampler_output: Callable (optional, default=None)
-            See torch_geometric.loader.NeighborLoader.
-        is_sorted: bool (optional, default=False)
-            Ignored by cuGraph.
-            See torch_geometric.loader.NeighborLoader.
-        filter_per_worker: bool (optional, default=False)
-            Currently ignored by cuGraph, but this may
-            change once in-memory sampling is implemented.
-            See torch_geometric.loader.NeighborLoader.
-        neighbor_sampler: torch_geometric.sampler.NeighborSampler
-            (optional, default=None)
-            Not supported by cuGraph.
-            See torch_geometric.loader.NeighborLoader.
-        directed: bool (optional, default=True)
-            Deprecated.
-            See torch_geometric.loader.NeighborLoader.
-        batch_size: int (optional, default=16)
-            The number of input nodes per output minibatch.
-            See torch.utils.dataloader.
-        directory: str (optional, default=None)
-            The directory where samples will be temporarily stored,
-            if spilling samples to disk.  If None, this loader
-            will perform buffered in-memory sampling.
-            If writing to disk, setting this argument
-            to a tempfile.TemporaryDirectory with a context
-            manager is a good option but depending on the filesystem,
-            you may want to choose an alternative location with fast I/O
-            intead.
-            See cugraph.gnn.DistSampleWriter.
-        batches_per_partition: int (optional, default=256)
-            The number of batches per partition if writing samples to
-            disk.  Manually tuning this parameter is not recommended
-            but reducing it may help conserve GPU memory.
-            See cugraph.gnn.DistSampleWriter.
-        format: str (optional, default='parquet')
-            If writing samples to disk, they will be written in this
-            file format.
-            See cugraph.gnn.DistSampleWriter.
-        compression: str (optional, default=None)
-            The compression type to use if writing samples to disk.
-            If not provided, it is automatically chosen.
-        local_seeds_per_call: int (optional, default=None)
-            The number of seeds to process within a single sampling call.
-            Manually tuning this parameter is not recommended but reducing
-            it may conserve GPU memory.  The total number of seeds processed
-            per sampling call is equal to the sum of this parameter across
-            all workers.  If not provided, it will be automatically
-            calculated.
-            See cugraph.gnn.DistSampler.
-        **kwargs
-            Other keyword arguments passed to the superclass.
-        """
-
-        subgraph_type = torch_geometric.sampler.base.SubgraphType(subgraph_type)
-
-        if not directed:
-            subgraph_type = torch_geometric.sampler.base.SubgraphType.induced
-            warnings.warn(
-                "The 'directed' argument is deprecated. "
-                "Use subgraph_type='induced' instead."
-            )
-        if subgraph_type != torch_geometric.sampler.base.SubgraphType.directional:
-            raise ValueError("Only directional subgraphs are currently supported")
-        if disjoint:
-            raise ValueError("Disjoint sampling is currently unsupported")
-        if temporal_strategy != "uniform":
-            warnings.warn("Only the uniform temporal strategy is currently supported")
-        if neighbor_sampler is not None:
-            raise ValueError("Passing a neighbor sampler is currently unsupported")
-        if time_attr is not None:
-            raise ValueError("Temporal sampling is currently unsupported")
-        if is_sorted:
-            warnings.warn("The 'is_sorted' argument is ignored by cuGraph.")
-        if not isinstance(data, (list, tuple)) or not isinstance(
-            data[1], cugraph_pyg.data.GraphStore
-        ):
-            # Will eventually automatically convert these objects to cuGraph objects.
-            raise NotImplementedError("Currently can't accept non-cugraph graphs")
-
-        if compression is None:
-            compression = "CSR"
-        elif compression not in ["CSR", "COO"]:
-            raise ValueError("Invalid value for compression (expected 'CSR' or 'COO')")
-
-        writer = (
-            None
-            if directory is None
-            else DistSampleWriter(
-                directory=directory,
-                batches_per_partition=batches_per_partition,
-                format=format,
-            )
-        )
-
-        feature_store, graph_store = data
-
-        if weight_attr is not None:
-            graph_store._set_weight_attr((feature_store, weight_attr))
-
-        sampler = BaseSampler(
-            NeighborSampler(
-                graph_store._graph,
-                writer,
-                retain_original_seeds=True,
-                fanout=num_neighbors,
-                prior_sources_behavior="exclude",
-                deduplicate_sources=True,
-                compression=compression,
-                compress_per_hop=False,
-                with_replacement=replace,
-                local_seeds_per_call=local_seeds_per_call,
-                biased=(weight_attr is not None),
-            ),
-            (feature_store, graph_store),
-            batch_size=batch_size,
-        )
-        # TODO add heterogeneous support and pass graph_store._vertex_offsets
-
-        super().__init__(
-            (feature_store, graph_store),
-            sampler,
-            input_nodes=input_nodes,
-            input_time=input_time,
-            transform=transform,
-            transform_sampler_output=transform_sampler_output,
-            filter_per_worker=filter_per_worker,
-            batch_size=batch_size,
-            **kwargs,
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py
deleted file mode 100644
index 4b236f75885..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-import cugraph_pyg
-from typing import Union, Tuple, Callable, Optional
-
-from cugraph.utilities.utils import import_optional
-
-torch_geometric = import_optional("torch_geometric")
-torch = import_optional("torch")
-
-
-class NodeLoader:
-    """
-    Duck-typed version of torch_geometric.loader.NodeLoader.
-    Loads samples from batches of input nodes using a
-    `~cugraph_pyg.sampler.BaseSampler.sample_from_nodes`
-    function.
-    """
-
-    def __init__(
-        self,
-        data: Union[
-            "torch_geometric.data.Data",
-            "torch_geometric.data.HeteroData",
-            Tuple[
-                "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"
-            ],
-        ],
-        node_sampler: "cugraph_pyg.sampler.BaseSampler",
-        input_nodes: "torch_geometric.typing.InputNodes" = None,
-        input_time: "torch_geometric.typing.OptTensor" = None,
-        transform: Optional[Callable] = None,
-        transform_sampler_output: Optional[Callable] = None,
-        filter_per_worker: Optional[bool] = None,
-        custom_cls: Optional["torch_geometric.data.HeteroData"] = None,
-        input_id: "torch_geometric.typing.OptTensor" = None,
-        batch_size: int = 1,
-        shuffle: bool = False,
-        drop_last: bool = False,
-        **kwargs,
-    ):
-        """
-        Parameters
-        ----------
-            data: Data, HeteroData, or Tuple[FeatureStore, GraphStore]
-                See torch_geometric.loader.NodeLoader.
-            node_sampler: BaseSampler
-                See torch_geometric.loader.NodeLoader.
-            input_nodes: InputNodes
-                See torch_geometric.loader.NodeLoader.
-            input_time: OptTensor
-                See torch_geometric.loader.NodeLoader.
-            transform: Callable (optional, default=None)
-                This argument currently has no effect.
-            transform_sampler_output: Callable (optional, default=None)
-                This argument currently has no effect.
-            filter_per_worker: bool (optional, default=False)
-                This argument currently has no effect.
-            custom_cls: HeteroData
-                This argument currently has no effect.  This loader will
-                always return a Data or HeteroData object.
-            input_id: OptTensor
-                See torch_geometric.loader.NodeLoader.
-
-        """
-        if not isinstance(data, (list, tuple)) or not isinstance(
-            data[1], cugraph_pyg.data.GraphStore
-        ):
-            # Will eventually automatically convert these objects to cuGraph objects.
-            raise NotImplementedError("Currently can't accept non-cugraph graphs")
-
-        if not isinstance(node_sampler, cugraph_pyg.sampler.BaseSampler):
-            raise NotImplementedError("Must provide a cuGraph sampler")
-
-        if input_time is not None:
-            raise ValueError("Temporal sampling is currently unsupported")
-
-        if filter_per_worker:
-            warnings.warn("filter_per_worker is currently ignored")
-
-        if custom_cls is not None:
-            warnings.warn("custom_cls is currently ignored")
-
-        if transform is not None:
-            warnings.warn("transform is currently ignored.")
-
-        if transform_sampler_output is not None:
-            warnings.warn("transform_sampler_output is currently ignored.")
-
-        (
-            input_type,
-            input_nodes,
-            input_id,
-        ) = torch_geometric.loader.utils.get_input_nodes(
-            data,
-            input_nodes,
-            input_id,
-        )
-
-        self.__input_data = torch_geometric.sampler.NodeSamplerInput(
-            input_id=torch.arange(len(input_nodes), dtype=torch.int64, device="cuda")
-            if input_id is None
-            else input_id,
-            node=input_nodes,
-            time=None,
-            input_type=input_type,
-        )
-
-        self.__data = data
-
-        self.__node_sampler = node_sampler
-
-        self.__batch_size = batch_size
-        self.__shuffle = shuffle
-        self.__drop_last = drop_last
-
-    def __iter__(self):
-        if self.__shuffle:
-            perm = torch.randperm(self.__input_data.node.numel())
-        else:
-            perm = torch.arange(self.__input_data.node.numel())
-
-        if self.__drop_last:
-            d = perm.numel() % self.__batch_size
-            perm = perm[:-d]
-
-        input_data = torch_geometric.sampler.NodeSamplerInput(
-            input_id=self.__input_data.input_id[perm],
-            node=self.__input_data.node[perm],
-            time=None
-            if self.__input_data.time is None
-            else self.__input_data.time[perm],
-            input_type=self.__input_data.input_type,
-        )
-
-        return cugraph_pyg.sampler.SampleIterator(
-            self.__data, self.__node_sampler.sample_from_nodes(input_data)
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/__init__.py b/python/cugraph-pyg/cugraph_pyg/nn/__init__.py
deleted file mode 100644
index 331b49ebec0..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/nn/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .conv import *
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py
deleted file mode 100644
index bef3a023b93..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .gat_conv import GATConv
-from .gatv2_conv import GATv2Conv
-from .hetero_gat_conv import HeteroGATConv
-from .rgcn_conv import RGCNConv
-from .sage_conv import SAGEConv
-from .transformer_conv import TransformerConv
-
-__all__ = [
-    "GATConv",
-    "GATv2Conv",
-    "HeteroGATConv",
-    "RGCNConv",
-    "SAGEConv",
-    "TransformerConv",
-]
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py
deleted file mode 100644
index 713448a8203..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-from typing import Optional, Tuple, Union
-
-from cugraph.utilities.utils import import_optional
-import pylibcugraphops.pytorch
-
-
-torch = import_optional("torch")
-torch_geometric = import_optional("torch_geometric")
-
-# A tuple of (row, colptr, num_src_nodes)
-CSC = Tuple[torch.Tensor, torch.Tensor, int]
-
-
-class BaseConv(torch.nn.Module):  # pragma: no cover
-    r"""An abstract base class for implementing cugraph-ops message passing layers."""
-
-    def reset_parameters(self):
-        r"""Resets all learnable parameters of the module."""
-        pass
-
-    @staticmethod
-    def to_csc(
-        edge_index: torch.Tensor,
-        size: Optional[Tuple[int, int]] = None,
-        edge_attr: Optional[torch.Tensor] = None,
-    ) -> Union[CSC, Tuple[CSC, torch.Tensor],]:
-        r"""Returns a CSC representation of an :obj:`edge_index` tensor to be
-        used as input to cugraph-ops conv layers.
-
-        Args:
-            edge_index (torch.Tensor): The edge indices.
-            size ((int, int), optional). The shape of :obj:`edge_index` in each
-                dimension. (default: :obj:`None`)
-            edge_attr (torch.Tensor, optional): The edge features.
-                (default: :obj:`None`)
-        """
-        if size is None:
-            warnings.warn(
-                f"Inferring the graph size from 'edge_index' causes "
-                f"a decline in performance and does not work for "
-                f"bipartite graphs. To suppress this warning, pass "
-                f"the 'size' explicitly in '{__name__}.to_csc()'."
-            )
-            num_src_nodes = num_dst_nodes = int(edge_index.max()) + 1
-        else:
-            num_src_nodes, num_dst_nodes = size
-
-        row, col = edge_index
-        col, perm = torch_geometric.utils.index_sort(col, max_value=num_dst_nodes)
-        row = row[perm]
-
-        colptr = torch_geometric.utils.sparse.index2ptr(col, num_dst_nodes)
-
-        if edge_attr is not None:
-            return (row, colptr, num_src_nodes), edge_attr[perm]
-
-        return row, colptr, num_src_nodes
-
-    def get_cugraph(
-        self,
-        edge_index: Union[torch_geometric.EdgeIndex, CSC],
-        bipartite: bool = False,
-        max_num_neighbors: Optional[int] = None,
-    ) -> Tuple[pylibcugraphops.pytorch.CSC, Optional[torch.Tensor]]:
-        r"""Constructs a :obj:`cugraph-ops` graph object from CSC representation.
-        Supports both bipartite and non-bipartite graphs.
-
-        Args:
-            edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge
-                indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for
-                CSC representation.
-            bipartite (bool): If set to :obj:`True`, will create the bipartite
-                structure in cugraph-ops. (default: :obj:`False`)
-            max_num_neighbors (int, optional): The maximum number of neighbors
-                of a destination node. When enabled, it allows models to use
-                the message-flow-graph primitives in cugraph-ops.
-                (default: :obj:`None`)
-        """
-        perm = None
-        if isinstance(edge_index, torch_geometric.EdgeIndex):
-            edge_index, perm = edge_index.sort_by("col")
-            num_src_nodes = edge_index.get_sparse_size(0)
-            (colptr, row), _ = edge_index.get_csc()
-        else:
-            row, colptr, num_src_nodes = edge_index
-
-        if not row.is_cuda:
-            raise RuntimeError(
-                f"'{self.__class__.__name__}' requires GPU-based processing "
-                f"but got CPU tensor."
-            )
-
-        if max_num_neighbors is None:
-            max_num_neighbors = -1
-
-        return (
-            pylibcugraphops.pytorch.CSC(
-                offsets=colptr,
-                indices=row,
-                num_src_nodes=num_src_nodes,
-                dst_max_in_degree=max_num_neighbors,
-                is_bipartite=bipartite,
-            ),
-            perm,
-        )
-
-    def get_typed_cugraph(
-        self,
-        edge_index: Union[torch_geometric.EdgeIndex, CSC],
-        edge_type: torch.Tensor,
-        num_edge_types: Optional[int] = None,
-        bipartite: bool = False,
-        max_num_neighbors: Optional[int] = None,
-    ) -> Tuple[pylibcugraphops.pytorch.HeteroCSC, Optional[torch.Tensor]]:
-        r"""Constructs a typed :obj:`cugraph` graph object from a CSC
-        representation where each edge corresponds to a given edge type.
-        Supports both bipartite and non-bipartite graphs.
-
-        Args:
-            edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge
-                indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for
-                CSC representation.
-            edge_type (torch.Tensor): The edge type.
-            num_edge_types (int, optional): The maximum number of edge types.
-                When not given, will be computed on-the-fly, leading to
-                slightly worse performance. (default: :obj:`None`)
-            bipartite (bool): If set to :obj:`True`, will create the bipartite
-                structure in cugraph-ops. (default: :obj:`False`)
-            max_num_neighbors (int, optional): The maximum number of neighbors
-                of a destination node. When enabled, it allows models to use
-                the message-flow-graph primitives in cugraph-ops.
-                (default: :obj:`None`)
-        """
-        if num_edge_types is None:
-            num_edge_types = int(edge_type.max()) + 1
-
-        if max_num_neighbors is None:
-            max_num_neighbors = -1
-
-        perm = None
-        if isinstance(edge_index, torch_geometric.EdgeIndex):
-            edge_index, perm = edge_index.sort_by("col")
-            edge_type = edge_type[perm]
-            num_src_nodes = edge_index.get_sparse_size(0)
-            (colptr, row), _ = edge_index.get_csc()
-        else:
-            row, colptr, num_src_nodes = edge_index
-        edge_type = edge_type.int()
-
-        return (
-            pylibcugraphops.pytorch.HeteroCSC(
-                offsets=colptr,
-                indices=row,
-                edge_types=edge_type,
-                num_src_nodes=num_src_nodes,
-                num_edge_types=num_edge_types,
-                dst_max_in_degree=max_num_neighbors,
-                is_bipartite=bipartite,
-            ),
-            perm,
-        )
-
-    def forward(
-        self,
-        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
-        edge_index: Union[torch_geometric.EdgeIndex, CSC],
-    ) -> torch.Tensor:
-        r"""Runs the forward pass of the module.
-
-        Args:
-            x (torch.Tensor): The node features.
-            edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge
-                indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for
-                CSC representation.
-        """
-        raise NotImplementedError
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py
deleted file mode 100644
index 981b1c5b50d..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Tuple, Union
-
-from cugraph.utilities.utils import import_optional
-from pylibcugraphops.pytorch.operators import mha_gat_n2n
-
-from .base import BaseConv, CSC
-
-torch = import_optional("torch")
-nn = import_optional("torch.nn")
-torch_geometric = import_optional("torch_geometric")
-
-
-class GATConv(BaseConv):
-    r"""The graph attentional operator from the `"Graph Attention Networks"
-    <https://arxiv.org/abs/1710.10903>`_ paper.
-
-    .. math::
-        \mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} +
-        \sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j},
-
-    where the attention coefficients :math:`\alpha_{i,j}` are computed as
-
-    .. math::
-        \alpha_{i,j} =
-        \frac{
-        \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
-        [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_j]
-        \right)\right)}
-        {\sum_{k \in \mathcal{N}(i) \cup \{ i \}}
-        \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
-        [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k]
-        \right)\right)}.
-
-    If the graph has multi-dimensional edge features :math:`\mathbf{e}_{i,j}`,
-    the attention coefficients :math:`\alpha_{i,j}` are computed as
-
-    .. math::
-        \alpha_{i,j} =
-        \frac{
-        \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
-        [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_j
-        \, \Vert \, \mathbf{\Theta}_{e} \mathbf{e}_{i,j}]\right)\right)}
-        {\sum_{k \in \mathcal{N}(i) \cup \{ i \}}
-        \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
-        [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k
-        \, \Vert \, \mathbf{\Theta}_{e} \mathbf{e}_{i,k}]\right)\right)}.
-
-    Args:
-        in_channels (int or tuple): Size of each input sample, or :obj:`-1` to
-            derive the size from the first input(s) to the forward method.
-            A tuple corresponds to the sizes of source and target
-            dimensionalities.
-        out_channels (int): Size of each output sample.
-        heads (int, optional): Number of multi-head-attentions.
-            (default: :obj:`1`)
-        concat (bool, optional): If set to :obj:`False`, the multi-head
-            attentions are averaged instead of concatenated.
-            (default: :obj:`True`)
-        negative_slope (float, optional): LeakyReLU angle of the negative
-            slope. (default: :obj:`0.2`)
-        edge_dim (int, optional): Edge feature dimensionality (in case
-            there are any). (default: :obj:`None`)
-        bias (bool, optional): If set to :obj:`False`, the layer will not learn
-            an additive bias. (default: :obj:`True`)
-    """
-
-    def __init__(
-        self,
-        in_channels: Union[int, Tuple[int, int]],
-        out_channels: int,
-        heads: int = 1,
-        concat: bool = True,
-        negative_slope: float = 0.2,
-        edge_dim: Optional[int] = None,
-        bias: bool = True,
-    ):
-        super().__init__()
-
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.heads = heads
-        self.concat = concat
-        self.negative_slope = negative_slope
-        self.edge_dim = edge_dim
-
-        Linear = torch_geometric.nn.Linear
-
-        if isinstance(in_channels, int):
-            self.lin = Linear(
-                in_channels,
-                heads * out_channels,
-                bias=False,
-                weight_initializer="glorot",
-            )
-        else:
-            self.lin_src = Linear(
-                in_channels[0],
-                heads * out_channels,
-                bias=False,
-                weight_initializer="glorot",
-            )
-            self.lin_dst = Linear(
-                in_channels[1],
-                heads * out_channels,
-                bias=False,
-                weight_initializer="glorot",
-            )
-
-        if edge_dim is not None:
-            self.lin_edge = Linear(
-                edge_dim,
-                heads * out_channels,
-                bias=False,
-                weight_initializer="glorot",
-            )
-            self.att = nn.Parameter(torch.Tensor(3 * heads * out_channels))
-        else:
-            self.register_parameter("lin_edge", None)
-            self.att = nn.Parameter(torch.Tensor(2 * heads * out_channels))
-
-        if bias and concat:
-            self.bias = nn.Parameter(torch.Tensor(heads * out_channels))
-        elif bias and not concat:
-            self.bias = nn.Parameter(torch.Tensor(out_channels))
-        else:
-            self.register_parameter("bias", None)
-
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        if isinstance(self.in_channels, int):
-            self.lin.reset_parameters()
-        else:
-            self.lin_src.reset_parameters()
-            self.lin_dst.reset_parameters()
-
-        torch_geometric.nn.inits.glorot(
-            self.att.view(-1, self.heads, self.out_channels)
-        )
-
-        if self.lin_edge is not None:
-            self.lin_edge.reset_parameters()
-
-        torch_geometric.nn.inits.zeros(self.bias)
-
-    def forward(
-        self,
-        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
-        edge_index: Union[torch_geometric.EdgeIndex, CSC],
-        edge_attr: Optional[torch.Tensor] = None,
-        max_num_neighbors: Optional[int] = None,
-        deterministic_dgrad: bool = False,
-        deterministic_wgrad: bool = False,
-        high_precision_dgrad: bool = False,
-        high_precision_wgrad: bool = False,
-    ) -> torch.Tensor:
-        r"""Runs the forward pass of the module.
-
-        Args:
-            x (torch.Tensor or tuple): The node features. Can be a tuple of
-                tensors denoting source and destination node features.
-            edge_index (EdgeIndex or CSC): The edge indices.
-            edge_attr: (torch.Tensor, optional) The edge features.
-            max_num_neighbors (int, optional): The maximum number of neighbors
-                of a destination node. When enabled, it allows models to use
-                the message-flow-graph primitives in cugraph-ops.
-                (default: :obj:`None`)
-            deterministic_dgrad : bool, default=False
-                Optional flag indicating whether the feature gradients
-                are computed deterministically using a dedicated workspace buffer.
-            deterministic_wgrad: bool, default=False
-                Optional flag indicating whether the weight gradients
-                are computed deterministically using a dedicated workspace buffer.
-            high_precision_dgrad: bool, default=False
-                Optional flag indicating whether gradients for inputs in half precision
-                are kept in single precision as long as possible and only casted to
-                the corresponding input type at the very end.
-            high_precision_wgrad: bool, default=False
-                Optional flag indicating whether gradients for weights in half precision
-                are kept in single precision as long as possible and only casted to
-                the corresponding input type at the very end.
-        """
-        bipartite = not isinstance(x, torch.Tensor)
-        graph, perm = self.get_cugraph(
-            edge_index=edge_index,
-            bipartite=bipartite,
-            max_num_neighbors=max_num_neighbors,
-        )
-
-        if deterministic_dgrad:
-            graph.add_reverse_graph()
-
-        if edge_attr is not None:
-            if self.lin_edge is None:
-                raise RuntimeError(
-                    f"{self.__class__.__name__}.edge_dim must be set to accept "
-                    f"edge features."
-                )
-            if edge_attr.dim() == 1:
-                edge_attr = edge_attr.view(-1, 1)
-            if perm is not None:
-                edge_attr = edge_attr[perm]
-            edge_attr = self.lin_edge(edge_attr)
-
-        if bipartite:
-            if not hasattr(self, "lin_src"):
-                raise RuntimeError(
-                    f"{self.__class__.__name__}.in_channels must be a pair of "
-                    f"integers to allow bipartite node features, but got "
-                    f"{self.in_channels}."
-                )
-            x_src = self.lin_src(x[0])
-            x_dst = self.lin_dst(x[1])
-        else:
-            if not hasattr(self, "lin"):
-                raise RuntimeError(
-                    f"{self.__class__.__name__}.in_channels is expected to be an "
-                    f"integer, but got {self.in_channels}."
-                )
-            x = self.lin(x)
-
-        out = mha_gat_n2n(
-            (x_src, x_dst) if bipartite else x,
-            self.att,
-            graph,
-            num_heads=self.heads,
-            activation="LeakyReLU",
-            negative_slope=self.negative_slope,
-            concat_heads=self.concat,
-            edge_feat=edge_attr,
-            deterministic_dgrad=deterministic_dgrad,
-            deterministic_wgrad=deterministic_wgrad,
-            high_precision_dgrad=high_precision_dgrad,
-            high_precision_wgrad=high_precision_wgrad,
-        )
-
-        if self.bias is not None:
-            out = out + self.bias
-
-        return out
-
-    def __repr__(self) -> str:
-        return (
-            f"{self.__class__.__name__}({self.in_channels}, "
-            f"{self.out_channels}, heads={self.heads})"
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py
deleted file mode 100644
index ebb30de9754..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Tuple, Union
-
-from cugraph.utilities.utils import import_optional
-from pylibcugraphops.pytorch.operators import mha_gat_v2_n2n
-
-from .base import BaseConv, CSC
-
-torch = import_optional("torch")
-nn = import_optional("torch.nn")
-torch_geometric = import_optional("torch_geometric")
-
-
-class GATv2Conv(BaseConv):
-    r"""The GATv2 operator from the `"How Attentive are Graph Attention
-    Networks?" <https://arxiv.org/abs/2105.14491>`_ paper, which fixes the
-    static attention problem of the standard
-    :class:`~torch_geometric.conv.GATConv` layer.
-    Since the linear layers in the standard GAT are applied right after each
-    other, the ranking of attended nodes is unconditioned on the query node.
-    In contrast, in :class:`GATv2`, every node can attend to any other node.
-
-    .. math::
-        \mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} +
-        \sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j},
-
-    where the attention coefficients :math:`\alpha_{i,j}` are computed as
-
-    .. math::
-        \alpha_{i,j} =
-        \frac{
-        \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta}
-        [\mathbf{x}_i \, \Vert \, \mathbf{x}_j]
-        \right)\right)}
-        {\sum_{k \in \mathcal{N}(i) \cup \{ i \}}
-        \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta}
-        [\mathbf{x}_i \, \Vert \, \mathbf{x}_k]
-        \right)\right)}.
-
-    If the graph has multi-dimensional edge features :math:`\mathbf{e}_{i,j}`,
-    the attention coefficients :math:`\alpha_{i,j}` are computed as
-
-    .. math::
-        \alpha_{i,j} =
-        \frac{
-        \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta}
-        [\mathbf{x}_i \, \Vert \, \mathbf{x}_j \, \Vert \, \mathbf{e}_{i,j}]
-        \right)\right)}
-        {\sum_{k \in \mathcal{N}(i) \cup \{ i \}}
-        \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta}
-        [\mathbf{x}_i \, \Vert \, \mathbf{x}_k \, \Vert \, \mathbf{e}_{i,k}]
-        \right)\right)}.
-
-    Args:
-        in_channels (int or tuple): Size of each input sample, or :obj:`-1` to
-            derive the size from the first input(s) to the forward method.
-            A tuple corresponds to the sizes of source and target
-            dimensionalities.
-        out_channels (int): Size of each output sample.
-        heads (int, optional): Number of multi-head-attentions.
-            (default: :obj:`1`)
-        concat (bool, optional): If set to :obj:`False`, the multi-head
-            attentions are averaged instead of concatenated.
-            (default: :obj:`True`)
-        negative_slope (float, optional): LeakyReLU angle of the negative
-            slope. (default: :obj:`0.2`)
-        edge_dim (int, optional): Edge feature dimensionality (in case
-            there are any). (default: :obj:`None`)
-        bias (bool, optional): If set to :obj:`False`, the layer will not learn
-            an additive bias. (default: :obj:`True`)
-        share_weights (bool, optional): If set to :obj:`True`, the same matrix
-            will be applied to the source and the target node of every edge.
-            (default: :obj:`False`)
-    """
-
-    def __init__(
-        self,
-        in_channels: Union[int, Tuple[int, int]],
-        out_channels: int,
-        heads: int = 1,
-        concat: bool = True,
-        negative_slope: float = 0.2,
-        edge_dim: Optional[int] = None,
-        bias: bool = True,
-        share_weights: bool = False,
-    ):
-        super().__init__()
-
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.heads = heads
-        self.concat = concat
-        self.negative_slope = negative_slope
-        self.edge_dim = edge_dim
-        self.share_weights = share_weights
-
-        Linear = torch_geometric.nn.Linear
-
-        if isinstance(in_channels, int):
-            self.lin_src = Linear(
-                in_channels,
-                heads * out_channels,
-                bias=bias,
-                weight_initializer="glorot",
-            )
-
-            if share_weights:
-                self.lin_dst = self.lin_src
-            else:
-                self.lin_dst = Linear(
-                    in_channels,
-                    heads * out_channels,
-                    bias=bias,
-                    weight_initializer="glorot",
-                )
-        else:
-            self.lin_src = Linear(
-                in_channels[0],
-                heads * out_channels,
-                bias=bias,
-                weight_initializer="glorot",
-            )
-            self.lin_dst = Linear(
-                in_channels[1],
-                heads * out_channels,
-                bias=bias,
-                weight_initializer="glorot",
-            )
-
-        self.att = nn.Parameter(torch.Tensor(heads * out_channels))
-
-        if edge_dim is not None:
-            self.lin_edge = Linear(
-                edge_dim, heads * out_channels, bias=False, weight_initializer="glorot"
-            )
-        else:
-            self.register_parameter("lin_edge", None)
-
-        if bias and concat:
-            self.bias = nn.Parameter(torch.Tensor(heads * out_channels))
-        elif bias and not concat:
-            self.bias = nn.Parameter(torch.Tensor(out_channels))
-        else:
-            self.register_parameter("bias", None)
-
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        self.lin_src.reset_parameters()
-        self.lin_dst.reset_parameters()
-        if self.lin_edge is not None:
-            self.lin_edge.reset_parameters()
-
-        torch_geometric.nn.inits.glorot(
-            self.att.view(-1, self.heads, self.out_channels)
-        )
-
-        torch_geometric.nn.inits.zeros(self.bias)
-
-    def forward(
-        self,
-        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
-        edge_index: Union[torch_geometric.EdgeIndex, CSC],
-        edge_attr: Optional[torch.Tensor] = None,
-        deterministic_dgrad: bool = False,
-        deterministic_wgrad: bool = False,
-    ) -> torch.Tensor:
-        r"""Runs the forward pass of the module.
-
-        Args:
-            x (torch.Tensor or tuple): The node features. Can be a tuple of
-                tensors denoting source and destination node features.
-            edge_index (EdgeIndex or CSC): The edge indices.
-            edge_attr: (torch.Tensor, optional) The edge features.
-            deterministic_dgrad : bool, default=False
-                Optional flag indicating whether the feature gradients
-                are computed deterministically using a dedicated workspace buffer.
-            deterministic_wgrad: bool, default=False
-                Optional flag indicating whether the weight gradients
-                are computed deterministically using a dedicated workspace buffer.
-        """
-        bipartite = not isinstance(x, torch.Tensor) or not self.share_weights
-        graph, perm = self.get_cugraph(edge_index, bipartite=bipartite)
-        if deterministic_dgrad:
-            graph.add_reverse_graph()
-
-        if edge_attr is not None:
-            if self.lin_edge is None:
-                raise RuntimeError(
-                    f"{self.__class__.__name__}.edge_dim must be set to accept "
-                    f"edge features."
-                )
-            if edge_attr.dim() == 1:
-                edge_attr = edge_attr.view(-1, 1)
-            if perm is not None:
-                edge_attr = edge_attr[perm]
-            edge_attr = self.lin_edge(edge_attr)
-
-        if bipartite:
-            if isinstance(x, torch.Tensor):
-                x = (x, x)
-            x_src = self.lin_src(x[0])
-            x_dst = self.lin_dst(x[1])
-        else:
-            x = self.lin_src(x)
-
-        out = mha_gat_v2_n2n(
-            (x_src, x_dst) if bipartite else x,
-            self.att,
-            graph,
-            num_heads=self.heads,
-            activation="LeakyReLU",
-            negative_slope=self.negative_slope,
-            concat_heads=self.concat,
-            edge_feat=edge_attr,
-            deterministic_dgrad=deterministic_dgrad,
-            deterministic_wgrad=deterministic_wgrad,
-        )
-
-        if self.bias is not None:
-            out = out + self.bias
-
-        return out
-
-    def __repr__(self) -> str:
-        return (
-            f"{self.__class__.__name__}({self.in_channels}, "
-            f"{self.out_channels}, heads={self.heads})"
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py
deleted file mode 100644
index a73dd8e57ff..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py
+++ /dev/null
@@ -1,266 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Union
-from collections import defaultdict
-
-from cugraph.utilities.utils import import_optional
-from pylibcugraphops.pytorch.operators import mha_gat_n2n
-
-from .base import BaseConv
-from cugraph_pyg.utils.imports import package_available
-
-torch = import_optional("torch")
-torch_geometric = import_optional("torch_geometric")
-
-
-class HeteroGATConv(BaseConv):
-    r"""The graph attentional operator on heterogeneous graphs, where a separate
-    `GATConv` is applied on the homogeneous graph for each edge type. Compared
-    with directly wrapping `GATConv`s with `HeteroConv`, `HeteroGATConv` fuses
-    all the linear transformation associated with each node type together into 1
-    GEMM call, to improve the performance on GPUs.
-
-    Parameters
-    ----------
-    in_channels : int or Dict[str, int])
-        Size of each input sample of every node type.
-
-    out_channels : int
-        Size of each output sample.
-
-    node_types : List[str]
-        List of Node types.
-
-    edge_types : List[Tuple[str, str, str]]
-        List of Edge types.
-
-    heads : int, optional (default=1)
-        Number of multi-head-attentions.
-
-    concat : bool, optional (default=True):
-        If set to :obj:`False`, the multi-head attentions are averaged instead
-        of concatenated.
-
-    negative_slope : float, optional (default=0.2)
-        LeakyReLU angle of the negative slope.
-
-    bias : bool, optional (default=True)
-        If set to :obj:`False`, the layer will not learn an additive bias.
-
-    aggr : str, optional (default="sum")
-        The aggregation scheme to use for grouping node embeddings generated by
-        different relations. Choose from "sum", "mean", "min", "max".
-    """
-
-    def __init__(
-        self,
-        in_channels: Union[int, dict[str, int]],
-        out_channels: int,
-        node_types: list[str],
-        edge_types: list[tuple[str, str, str]],
-        heads: int = 1,
-        concat: bool = True,
-        negative_slope: float = 0.2,
-        bias: bool = True,
-        aggr: str = "sum",
-    ):
-        if not package_available("torch_geometric>=2.4.0"):
-            raise RuntimeError(
-                f"{self.__class__.__name__} requires torch_geometric>=2.4.0."
-            )
-
-        super().__init__()
-
-        if isinstance(in_channels, int):
-            in_channels = dict.fromkeys(node_types, in_channels)
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-
-        self.node_types = node_types
-        self.edge_types = edge_types
-        self.num_heads = heads
-        self.concat_heads = concat
-
-        self.negative_slope = negative_slope
-        self.aggr = aggr
-
-        self.relations_per_ntype = defaultdict(lambda: ([], []))
-
-        lin_weights = dict.fromkeys(self.node_types)
-        attn_weights = dict.fromkeys(self.edge_types)
-        biases = dict.fromkeys(self.edge_types)
-
-        ParameterDict = torch_geometric.nn.parameter_dict.ParameterDict
-
-        for edge_type in self.edge_types:
-            src_type, _, dst_type = edge_type
-            self.relations_per_ntype[src_type][0].append(edge_type)
-            if src_type != dst_type:
-                self.relations_per_ntype[dst_type][1].append(edge_type)
-
-            attn_weights[edge_type] = torch.empty(
-                2 * self.num_heads * self.out_channels
-            )
-
-            if bias and concat:
-                biases[edge_type] = torch.empty(self.num_heads * out_channels)
-            elif bias:
-                biases[edge_type] = torch.empty(out_channels)
-            else:
-                biases[edge_type] = None
-
-        for ntype in self.node_types:
-            n_src_rel = len(self.relations_per_ntype[ntype][0])
-            n_dst_rel = len(self.relations_per_ntype[ntype][1])
-            n_rel = n_src_rel + n_dst_rel
-
-            lin_weights[ntype] = torch.empty(
-                (n_rel * self.num_heads * self.out_channels, self.in_channels[ntype])
-            )
-
-        self.lin_weights = ParameterDict(lin_weights)
-        self.attn_weights = ParameterDict(attn_weights)
-
-        if bias:
-            self.bias = ParameterDict(biases)
-        else:
-            self.register_parameter("bias", None)
-
-        self.reset_parameters()
-
-    def split_tensors(
-        self, x_fused_dict: dict[str, torch.Tensor], dim: int
-    ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]:
-        """Split fused tensors into chunks based on edge types.
-
-        Parameters
-        ----------
-        x_fused_dict : dict[str, torch.Tensor]
-            A dictionary to hold node feature for each node type. The key is
-            node type; the value is a fused tensor that account for all
-            relations for that node type.
-
-        dim : int
-            Dimension along which to split the fused tensor.
-
-        Returns
-        -------
-        x_src_dict : dict[str, torch.Tensor]
-            A dictionary to hold source node feature for each relation graph.
-
-        x_dst_dict : dict[str, torch.Tensor]
-            A dictionary to hold destination node feature for each relation graph.
-        """
-        x_src_dict = dict.fromkeys(self.edge_types)
-        x_dst_dict = dict.fromkeys(self.edge_types)
-
-        for ntype, t in x_fused_dict.items():
-            n_src_rel = len(self.relations_per_ntype[ntype][0])
-            n_dst_rel = len(self.relations_per_ntype[ntype][1])
-            n_rel = n_src_rel + n_dst_rel
-            t_list = torch.chunk(t, chunks=n_rel, dim=dim)
-
-            for i, src_rel in enumerate(self.relations_per_ntype[ntype][0]):
-                x_src_dict[src_rel] = t_list[i]
-
-            for i, dst_rel in enumerate(self.relations_per_ntype[ntype][1]):
-                x_dst_dict[dst_rel] = t_list[i + n_src_rel]
-
-        return x_src_dict, x_dst_dict
-
-    def reset_parameters(self, seed: Optional[int] = None):
-        if seed is not None:
-            torch.manual_seed(seed)
-
-        w_src, w_dst = self.split_tensors(self.lin_weights, dim=0)
-
-        for edge_type in self.edge_types:
-            src_type, _, dst_type = edge_type
-
-            # lin_src
-            torch_geometric.nn.inits.glorot(w_src[edge_type])
-
-            # lin_dst
-            if src_type != dst_type:
-                torch_geometric.nn.inits.glorot(w_dst[edge_type])
-
-            # attn_weights
-            torch_geometric.nn.inits.glorot(
-                self.attn_weights[edge_type].view(-1, self.num_heads, self.out_channels)
-            )
-
-            # bias
-            if self.bias is not None:
-                torch_geometric.nn.inits.zeros(self.bias[edge_type])
-
-    def forward(
-        self,
-        x_dict: dict[str, torch.Tensor],
-        edge_index_dict: dict[tuple[str, str, str], torch.Tensor],
-    ) -> dict[str, torch.Tensor]:
-        feat_dict = dict.fromkeys(x_dict.keys())
-
-        for ntype, x in x_dict.items():
-            feat_dict[ntype] = x @ self.lin_weights[ntype].T
-
-        x_src_dict, x_dst_dict = self.split_tensors(feat_dict, dim=1)
-
-        out_dict = defaultdict(list)
-
-        for edge_type, edge_index in edge_index_dict.items():
-            src_type, _, dst_type = edge_type
-
-            csc = BaseConv.to_csc(
-                edge_index, (x_dict[src_type].size(0), x_dict[dst_type].size(0))
-            )
-
-            if src_type == dst_type:
-                graph, _ = self.get_cugraph(
-                    csc,
-                    bipartite=False,
-                )
-                out = mha_gat_n2n(
-                    x_src_dict[edge_type],
-                    self.attn_weights[edge_type],
-                    graph,
-                    num_heads=self.num_heads,
-                    activation="LeakyReLU",
-                    negative_slope=self.negative_slope,
-                    concat_heads=self.concat_heads,
-                )
-
-            else:
-                graph, _ = self.get_cugraph(
-                    csc,
-                    bipartite=True,
-                )
-                out = mha_gat_n2n(
-                    (x_src_dict[edge_type], x_dst_dict[edge_type]),
-                    self.attn_weights[edge_type],
-                    graph,
-                    num_heads=self.num_heads,
-                    activation="LeakyReLU",
-                    negative_slope=self.negative_slope,
-                    concat_heads=self.concat_heads,
-                )
-
-            if self.bias is not None:
-                out = out + self.bias[edge_type]
-
-            out_dict[dst_type].append(out)
-
-        for key, value in out_dict.items():
-            out_dict[key] = torch_geometric.nn.conv.hetero_conv.group(value, self.aggr)
-
-        return out_dict
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py
deleted file mode 100644
index 13fa08db5c5..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Union
-
-from cugraph.utilities.utils import import_optional
-from pylibcugraphops.pytorch.operators import agg_hg_basis_n2n_post
-
-from .base import BaseConv, CSC
-
-torch = import_optional("torch")
-torch_geometric = import_optional("torch_geometric")
-
-
-class RGCNConv(BaseConv):  # pragma: no cover
-    r"""The relational graph convolutional operator from the `"Modeling
-    Relational Data with Graph Convolutional Networks"
-    <https://arxiv.org/abs/1703.06103>`_ paper.
-
-    .. math::
-        \mathbf{x}^{\prime}_i = \mathbf{\Theta}_{\textrm{root}} \cdot
-        \mathbf{x}_i + \sum_{r \in \mathcal{R}} \sum_{j \in \mathcal{N}_r(i)}
-        \frac{1}{|\mathcal{N}_r(i)|} \mathbf{\Theta}_r \cdot \mathbf{x}_j,
-
-    where :math:`\mathcal{R}` denotes the set of relations, *i.e.* edge types.
-    Edge type needs to be a one-dimensional :obj:`torch.long` tensor which
-    stores a relation identifier
-    :math:`\in \{ 0, \ldots, |\mathcal{R}| - 1\}` for each edge.
-
-    Args:
-        in_channels (int): Size of each input sample.
-        out_channels (int): Size of each output sample.
-        num_relations (int): Number of relations.
-        num_bases (int, optional): If set, this layer will use the
-            basis-decomposition regularization scheme where :obj:`num_bases`
-            denotes the number of bases to use. (default: :obj:`None`)
-        aggr (str, optional): The aggregation scheme to use
-            (:obj:`"add"`, :obj:`"mean"`, :obj:`"sum"`).
-            (default: :obj:`"mean"`)
-        root_weight (bool, optional): If set to :obj:`False`, the layer will
-            not add transformed root node features to the output.
-            (default: :obj:`True`)
-        bias (bool, optional): If set to :obj:`False`, the layer will not learn
-            an additive bias. (default: :obj:`True`)
-    """
-
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        num_relations: int,
-        num_bases: Optional[int] = None,
-        aggr: str = "mean",
-        root_weight: bool = True,
-        bias: bool = True,
-    ):
-        super().__init__()
-
-        if aggr not in ["mean", "sum", "add"]:
-            raise ValueError(
-                f"Aggregation function must be chosen from 'mean', 'sum' or "
-                f"'add', but got '{aggr}'."
-            )
-
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.num_relations = num_relations
-        self.num_bases = num_bases
-        self.aggr = aggr
-        self.root_weight = root_weight
-
-        dim_root_weight = 1 if root_weight else 0
-
-        if num_bases is not None:
-            self.weight = torch.nn.Parameter(
-                torch.empty(num_bases + dim_root_weight, in_channels, out_channels)
-            )
-            self.comp = torch.nn.Parameter(torch.empty(num_relations, num_bases))
-        else:
-            self.weight = torch.nn.Parameter(
-                torch.empty(num_relations + dim_root_weight, in_channels, out_channels)
-            )
-            self.register_parameter("comp", None)
-
-        if bias:
-            self.bias = torch.nn.Parameter(torch.empty(out_channels))
-        else:
-            self.register_parameter("bias", None)
-
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        end = -1 if self.root_weight else None
-        torch_geometric.nn.inits.glorot(self.weight[:end])
-        torch_geometric.nn.inits.glorot(self.comp)
-        if self.root_weight:
-            torch_geometric.nn.inits.glorot(self.weight[-1])
-        torch_geometric.nn.inits.zeros(self.bias)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        edge_index: Union[torch_geometric.EdgeIndex, CSC],
-        edge_type: torch.Tensor,
-        max_num_neighbors: Optional[int] = None,
-    ) -> torch.Tensor:
-
-        graph, _ = self.get_typed_cugraph(
-            edge_index,
-            edge_type,
-            self.num_relations,
-            max_num_neighbors=max_num_neighbors,
-        )
-
-        out = agg_hg_basis_n2n_post(
-            x,
-            self.comp,
-            graph,
-            concat_own=self.root_weight,
-            norm_by_out_degree=bool(self.aggr == "mean"),
-        )
-
-        out = out @ self.weight.view(-1, self.out_channels)
-
-        if self.bias is not None:
-            out = out + self.bias
-
-        return out
-
-    def __repr__(self) -> str:
-        return (
-            f"{self.__class__.__name__}({self.in_channels}, "
-            f"{self.out_channels}, num_relations={self.num_relations})"
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py
deleted file mode 100644
index 65dc99d8988..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Tuple, Union
-
-from cugraph.utilities.utils import import_optional
-from pylibcugraphops.pytorch.operators import agg_concat_n2n
-
-from .base import BaseConv, CSC
-
-torch = import_optional("torch")
-torch_geometric = import_optional("torch_geometric")
-
-
-class SAGEConv(BaseConv):
-    r"""The GraphSAGE operator from the `"Inductive Representation Learning on
-    Large Graphs" <https://arxiv.org/abs/1706.02216>`_ paper.
-
-    .. math::
-        \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i + \mathbf{W}_2 \cdot
-        \mathrm{mean}_{j \in \mathcal{N(i)}} \mathbf{x}_j
-
-    If :obj:`project = True`, then :math:`\mathbf{x}_j` will first get
-    projected via
-
-    .. math::
-        \mathbf{x}_j \leftarrow \sigma ( \mathbf{W}_3 \mathbf{x}_j +
-        \mathbf{b})
-
-    as described in Eq. (3) of the paper.
-
-    Args:
-        in_channels (int or tuple): Size of each input sample. A tuple
-            corresponds to the sizes of source and target dimensionalities.
-        out_channels (int): Size of each output sample.
-        aggr (str or Aggregation, optional): The aggregation scheme to use.
-            Choose from :obj:`"mean"`, :obj:`"sum"`, :obj:`"min"` or
-            :obj:`"max"`. (default: :obj:`"mean"`)
-        normalize (bool, optional): If set to :obj:`True`, output features
-            will be :math:`\ell_2`-normalized, *i.e.*,
-            :math:`\frac{\mathbf{h}_i^{k+1}}
-            {\| \mathbf{h}_i^{k+1} \|_2}`.
-            (default: :obj:`False`)
-        root_weight (bool, optional): If set to :obj:`False`, the layer will
-            not add transformed root node features to the output.
-            (default: :obj:`True`)
-        project (bool, optional): If set to :obj:`True`, the layer will apply a
-            linear transformation followed by an activation function before
-            aggregation (as described in Eq. (3) of the paper).
-            (default: :obj:`False`)
-        bias (bool, optional): If set to :obj:`False`, the layer will not learn
-            an additive bias. (default: :obj:`True`)
-    """
-
-    def __init__(
-        self,
-        in_channels: Union[int, Tuple[int, int]],
-        out_channels: int,
-        aggr: str = "mean",
-        normalize: bool = False,
-        root_weight: bool = True,
-        project: bool = False,
-        bias: bool = True,
-    ):
-        super().__init__()
-
-        if aggr not in ["mean", "sum", "min", "max"]:
-            raise ValueError(
-                f"Aggregation function must be chosen from 'mean',"
-                f" 'sum', 'min' or 'max', but got '{aggr}'."
-            )
-
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.aggr = aggr
-        self.normalize = normalize
-        self.root_weight = root_weight
-        self.project = project
-
-        if isinstance(in_channels, int):
-            self.in_channels_src = self.in_channels_dst = in_channels
-        else:
-            self.in_channels_src, self.in_channels_dst = in_channels
-
-        if self.project:
-            self.pre_lin = torch_geometric.nn.Linear(
-                self.in_channels_src, self.in_channels_src, bias=True
-            )
-
-        if self.root_weight:
-            self.lin = torch_geometric.nn.Linear(
-                self.in_channels_src + self.in_channels_dst, out_channels, bias=bias
-            )
-        else:
-            self.lin = torch_geometric.nn.Linear(
-                self.in_channels_src, out_channels, bias=bias
-            )
-
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        if self.project:
-            self.pre_lin.reset_parameters()
-        self.lin.reset_parameters()
-
-    def forward(
-        self,
-        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
-        edge_index: Union[torch_geometric.EdgeIndex, CSC],
-        max_num_neighbors: Optional[int] = None,
-    ) -> torch.Tensor:
-        bipartite = isinstance(x, Tuple)
-        graph, _ = self.get_cugraph(
-            edge_index=edge_index,
-            bipartite=bipartite,
-            max_num_neighbors=max_num_neighbors,
-        )
-
-        if self.project:
-            if bipartite:
-                x = (self.pre_lin(x[0]).relu(), x[1])
-            else:
-                x = self.pre_lin(x).relu()
-
-        out = agg_concat_n2n(x, graph, self.aggr)
-
-        if self.root_weight:
-            out = self.lin(out)
-        else:
-            out = self.lin(out[:, : self.in_channels_src])
-
-        if self.normalize:
-            out = torch.nn.functional.normalize(out, p=2.0, dim=-1)
-
-        return out
-
-    def __repr__(self) -> str:
-        return (
-            f"{self.__class__.__name__}({self.in_channels}, "
-            f"{self.out_channels}, aggr={self.aggr})"
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py
deleted file mode 100644
index e184ee0e893..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Tuple, Union
-
-from cugraph.utilities.utils import import_optional
-from pylibcugraphops.pytorch.operators import mha_simple_n2n
-
-from .base import BaseConv, CSC
-
-torch = import_optional("torch")
-nn = import_optional("torch.nn")
-torch_geometric = import_optional("torch_geometric")
-
-
-class TransformerConv(BaseConv):
-    r"""The graph transformer operator from the `"Masked Label Prediction:
-    Unified Message Passing Model for Semi-Supervised Classification"
-    <https://arxiv.org/abs/2009.03509>`_ paper.
-
-    .. math::
-        \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i +
-        \sum_{j \in \mathcal{N}(i)} \alpha_{i,j} \mathbf{W}_2 \mathbf{x}_{j},
-
-    where the attention coefficients :math:`\alpha_{i,j}` are computed via
-    multi-head dot product attention:
-
-    .. math::
-        \alpha_{i,j} = \textrm{softmax} \left(
-        \frac{(\mathbf{W}_3\mathbf{x}_i)^{\top} (\mathbf{W}_4\mathbf{x}_j)}
-        {\sqrt{d}} \right)
-
-    Args:
-        in_channels (int or tuple): Size of each input sample, or :obj:`-1` to
-            derive the size from the first input(s) to the forward method.
-            A tuple corresponds to the sizes of source and target
-            dimensionalities.
-        out_channels (int): Size of each output sample.
-        heads (int, optional): Number of multi-head-attentions.
-            (default: :obj:`1`)
-        concat (bool, optional): If set to :obj:`False`, the multi-head
-            attentions are averaged instead of concatenated.
-            (default: :obj:`True`)
-        beta (bool, optional): If set, will combine aggregation and
-            skip information via
-
-            .. math::
-                \mathbf{x}^{\prime}_i = \beta_i \mathbf{W}_1 \mathbf{x}_i +
-                (1 - \beta_i) \underbrace{\left(\sum_{j \in \mathcal{N}(i)}
-                \alpha_{i,j} \mathbf{W}_2 \vec{x}_j \right)}_{=\mathbf{m}_i}
-
-            with :math:`\beta_i = \textrm{sigmoid}(\mathbf{w}_5^{\top}
-            [ \mathbf{W}_1 \mathbf{x}_i, \mathbf{m}_i, \mathbf{W}_1
-            \mathbf{x}_i - \mathbf{m}_i ])` (default: :obj:`False`)
-        edge_dim (int, optional): Edge feature dimensionality (in case
-            there are any). Edge features are added to the keys after
-            linear transformation, that is, prior to computing the
-            attention dot product. They are also added to final values
-            after the same linear transformation. The model is:
-
-            .. math::
-                \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i +
-                \sum_{j \in \mathcal{N}(i)} \alpha_{i,j} \left(
-                \mathbf{W}_2 \mathbf{x}_{j} + \mathbf{W}_6 \mathbf{e}_{ij}
-                \right),
-
-            where the attention coefficients :math:`\alpha_{i,j}` are now
-            computed via:
-
-            .. math::
-                \alpha_{i,j} = \textrm{softmax} \left(
-                \frac{(\mathbf{W}_3\mathbf{x}_i)^{\top}
-                (\mathbf{W}_4\mathbf{x}_j + \mathbf{W}_6 \mathbf{e}_{ij})}
-                {\sqrt{d}} \right)
-
-            (default :obj:`None`)
-        bias (bool, optional): If set to :obj:`False`, the layer will not learn
-            an additive bias. (default: :obj:`True`)
-        root_weight (bool, optional): If set to :obj:`False`, the layer will
-            not add the transformed root node features to the output and the
-            option  :attr:`beta` is set to :obj:`False`. (default: :obj:`True`)
-    """
-
-    def __init__(
-        self,
-        in_channels: Union[int, Tuple[int, int]],
-        out_channels: int,
-        heads: int = 1,
-        concat: bool = True,
-        beta: bool = False,
-        edge_dim: Optional[int] = None,
-        bias: bool = True,
-        root_weight: bool = True,
-    ):
-        super().__init__()
-
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.heads = heads
-        self.beta = beta and root_weight
-        self.root_weight = root_weight
-        self.concat = concat
-        self.edge_dim = edge_dim
-
-        if isinstance(in_channels, int):
-            in_channels = (in_channels, in_channels)
-
-        Linear = torch_geometric.nn.Linear
-
-        self.lin_key = Linear(in_channels[0], heads * out_channels)
-        self.lin_query = Linear(in_channels[1], heads * out_channels)
-        self.lin_value = Linear(in_channels[0], heads * out_channels)
-        if edge_dim is not None:
-            self.lin_edge = Linear(edge_dim, heads * out_channels, bias=False)
-        else:
-            self.lin_edge = self.register_parameter("lin_edge", None)
-
-        if concat:
-            self.lin_skip = Linear(in_channels[1], heads * out_channels, bias=bias)
-            if self.beta:
-                self.lin_beta = Linear(3 * heads * out_channels, 1, bias=False)
-            else:
-                self.lin_beta = self.register_parameter("lin_beta", None)
-        else:
-            self.lin_skip = Linear(in_channels[1], out_channels, bias=bias)
-            if self.beta:
-                self.lin_beta = Linear(3 * out_channels, 1, bias=False)
-            else:
-                self.lin_beta = self.register_parameter("lin_beta", None)
-
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        self.lin_key.reset_parameters()
-        self.lin_query.reset_parameters()
-        self.lin_value.reset_parameters()
-        if self.lin_edge is not None:
-            self.lin_edge.reset_parameters()
-        self.lin_skip.reset_parameters()
-        if self.lin_beta is not None:
-            self.lin_beta.reset_parameters()
-
-    def forward(
-        self,
-        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
-        edge_index: Union[torch_geometric.EdgeIndex, CSC],
-        edge_attr: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        r"""Runs the forward pass of the module.
-
-        Args:
-            x (torch.Tensor or tuple): The node features. Can be a tuple of
-                tensors denoting source and destination node features.
-            edge_index (EdgeIndex or CSC): The edge indices.
-            edge_attr: (torch.Tensor, optional) The edge features.
-        """
-        bipartite = True
-        graph, perm = self.get_cugraph(edge_index=edge_index, bipartite=bipartite)
-
-        if isinstance(x, torch.Tensor):
-            x = (x, x)
-
-        query = self.lin_query(x[1])
-        key = self.lin_key(x[0])
-        value = self.lin_value(x[0])
-
-        if edge_attr is not None:
-            if self.lin_edge is None:
-                raise RuntimeError(
-                    f"{self.__class__.__name__}.edge_dim must be set to accept "
-                    f"edge features."
-                )
-            if perm is not None:
-                edge_attr = edge_attr[perm]
-            edge_attr = self.lin_edge(edge_attr)
-
-        out = mha_simple_n2n(
-            key,
-            query,
-            value,
-            graph,
-            self.heads,
-            self.concat,
-            edge_emb=edge_attr,
-            norm_by_dim=True,
-            score_bias=None,
-        )
-
-        if self.root_weight:
-            x_r = self.lin_skip(x[1])
-            if self.lin_beta is not None:
-                beta = self.lin_beta(torch.cat([out, x_r, out - x_r], dim=-1))
-                beta = beta.sigmoid()
-                out = beta * x_r + (1 - beta) * out
-            else:
-                out = out + x_r
-
-        return out
-
-    def __repr__(self) -> str:
-        return (
-            f"{self.__class__.__name__}({self.in_channels}, "
-            f"{self.out_channels}, heads={self.heads})"
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py b/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py
deleted file mode 100644
index 34fe9c4463e..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from cugraph_pyg.sampler.sampler import BaseSampler, SampleIterator
diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py
deleted file mode 100644
index bc3d4fd8d3c..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py
+++ /dev/null
@@ -1,540 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Iterator, Union, Dict, Tuple
-
-from cugraph.utilities.utils import import_optional
-from cugraph.gnn import DistSampler
-
-from .sampler_utils import filter_cugraph_pyg_store, neg_sample, neg_cat
-
-torch = import_optional("torch")
-torch_geometric = import_optional("torch_geometric")
-
-
-class SampleIterator:
-    """
-    Iterator that combines output graphs with their
-    features to produce final output minibatches
-    that can be fed into a GNN model.
-    """
-
-    def __init__(
-        self,
-        data: Tuple[
-            "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"
-        ],
-        output_iter: Iterator[
-            Union[
-                "torch_geometric.sampler.HeteroSamplerOutput",
-                "torch_geometric.sampler.SamplerOutput",
-            ]
-        ],
-    ):
-        """
-        Constructs a new SampleIterator
-
-        Parameters
-        ----------
-        data: Tuple[torch_geometric.data.FeatureStore, torch_geometric.data.GraphStore]
-            The original graph that samples were generated from, as a
-            FeatureStore/GraphStore tuple.
-        output_iter: Iterator[Union["torch_geometric.sampler.HeteroSamplerOutput",
-        "torch_geometric.sampler.SamplerOutput"]]
-            An iterator over outputted sampling results.
-        """
-        self.__feature_store, self.__graph_store = data
-        self.__output_iter = output_iter
-
-    def __next__(self):
-        next_sample = next(self.__output_iter)
-        if isinstance(next_sample, torch_geometric.sampler.SamplerOutput):
-            sz = next_sample.edge.numel()
-            if sz == next_sample.col.numel() and (
-                next_sample.node.numel() > next_sample.col[-1]
-            ):
-                # This will only trigger on very small batches and will have minimal
-                # performance impact.  If COO output is removed, then this condition
-                # can be avoided.
-                col = next_sample.col
-            else:
-                col = torch_geometric.edge_index.ptr2index(
-                    next_sample.col, next_sample.edge.numel()
-                )
-
-            data = filter_cugraph_pyg_store(
-                self.__feature_store,
-                self.__graph_store,
-                next_sample.node,
-                next_sample.row,
-                col,
-                next_sample.edge,
-                None,
-            )
-
-            """
-            # TODO Re-enable this once PyG resolves
-            # the issue with edge features (9566)
-            data = torch_geometric.loader.utils.filter_custom_store(
-                self.__feature_store,
-                self.__graph_store,
-                next_sample.node,
-                next_sample.row,
-                col,
-                next_sample.edge,
-                None,
-            )
-            """
-
-            if "n_id" not in data:
-                data.n_id = next_sample.node
-            if next_sample.edge is not None and "e_id" not in data:
-                edge = next_sample.edge.to(torch.long)
-                data.e_id = edge
-
-            data.batch = next_sample.batch
-            data.num_sampled_nodes = next_sample.num_sampled_nodes
-            data.num_sampled_edges = next_sample.num_sampled_edges
-
-            data.input_id = next_sample.metadata[0]
-            data.batch_size = data.input_id.size(0)
-
-            if len(next_sample.metadata) == 2:
-                data.seed_time = next_sample.metadata[1]
-            elif len(next_sample.metadata) == 4:
-                (
-                    data.edge_label_index,
-                    data.edge_label,
-                    data.seed_time,
-                ) = next_sample.metadata[1:]
-            else:
-                raise ValueError("Invalid metadata")
-
-        elif isinstance(next_sample, torch_geometric.sampler.HeteroSamplerOutput):
-            col = {}
-            for edge_type, col_idx in next_sample.col:
-                sz = next_sample.edge[edge_type].numel()
-                if sz == col_idx.numel():
-                    col[edge_type] = col_idx
-                else:
-                    col[edge_type] = torch_geometric.edge_index.ptr2index(col_idx, sz)
-
-            data = torch_geometric.loader.utils.filter_custom_hetero_store(
-                self.__feature_store,
-                self.__graph_store,
-                next_sample.node,
-                next_sample.row,
-                col,
-                next_sample.edge,
-                None,
-            )
-
-            for key, node in next_sample.node.items():
-                if "n_id" not in data[key]:
-                    data[key].n_id = node
-
-            for key, edge in (next_sample.edge or {}).items():
-                if edge is not None and "e_id" not in data[key]:
-                    edge = edge.to(torch.long)
-                    data[key].e_id = edge
-
-            data.set_value_dict("batch", next_sample.batch)
-            data.set_value_dict("num_sampled_nodes", next_sample.num_sampled_nodes)
-            data.set_value_dict("num_sampled_edges", next_sample.num_sampled_edges)
-
-            # TODO figure out how to set input_id for heterogeneous output
-        else:
-            raise ValueError("Invalid output type")
-
-        return data
-
-    def __iter__(self):
-        return self
-
-
-class SampleReader:
-    """
-    Iterator that processes results from the cuGraph distributed sampler.
-    """
-
-    def __init__(
-        self, base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]
-    ):
-        """
-        Constructs a new SampleReader.
-
-        Parameters
-        ----------
-        base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]
-            The reader responsible for loading saved samples produced by
-            the cuGraph distributed sampler.
-        """
-        self.__base_reader = base_reader
-        self.__num_samples_remaining = 0
-        self.__index = 0
-
-    def __next__(self):
-        if self.__num_samples_remaining == 0:
-            # raw_sample_data is already a dict of tensors
-            self.__raw_sample_data, start_inclusive, end_inclusive = next(
-                self.__base_reader
-            )
-
-            self.__raw_sample_data["input_offsets"] -= self.__raw_sample_data[
-                "input_offsets"
-            ][0].clone()
-            self.__raw_sample_data["label_hop_offsets"] -= self.__raw_sample_data[
-                "label_hop_offsets"
-            ][0].clone()
-            self.__raw_sample_data["renumber_map_offsets"] -= self.__raw_sample_data[
-                "renumber_map_offsets"
-            ][0].clone()
-            if "major_offsets" in self.__raw_sample_data:
-                self.__raw_sample_data["major_offsets"] -= self.__raw_sample_data[
-                    "major_offsets"
-                ][0].clone()
-
-            self.__num_samples_remaining = end_inclusive - start_inclusive + 1
-            self.__index = 0
-
-        out = self._decode(self.__raw_sample_data, self.__index)
-        self.__index += 1
-        self.__num_samples_remaining -= 1
-        return out
-
-    def __iter__(self):
-        return self
-
-
-class HomogeneousSampleReader(SampleReader):
-    """
-    Subclass of SampleReader that reads homogeneous output samples
-    produced by the cuGraph distributed sampler.
-    """
-
-    def __init__(
-        self, base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]
-    ):
-        """
-        Constructs a new HomogeneousSampleReader
-
-        Parameters
-        ----------
-        base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]]
-            The iterator responsible for loading saved samples produced by
-            the cuGraph distributed sampler.
-        """
-        super().__init__(base_reader)
-
-    def __decode_csc(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int):
-        fanout_length = (raw_sample_data["label_hop_offsets"].numel() - 1) // (
-            raw_sample_data["renumber_map_offsets"].numel() - 1
-        )
-
-        major_offsets_start_incl = raw_sample_data["label_hop_offsets"][
-            index * fanout_length
-        ]
-        major_offsets_end_incl = raw_sample_data["label_hop_offsets"][
-            (index + 1) * fanout_length
-        ]
-
-        major_offsets = raw_sample_data["major_offsets"][
-            major_offsets_start_incl : major_offsets_end_incl + 1
-        ].clone()
-        minors = raw_sample_data["minors"][major_offsets[0] : major_offsets[-1]]
-        edge_id = raw_sample_data["edge_id"][major_offsets[0] : major_offsets[-1]]
-        # don't retrieve edge type for a homogeneous graph
-
-        major_offsets -= major_offsets[0].clone()
-
-        renumber_map_start = raw_sample_data["renumber_map_offsets"][index]
-        renumber_map_end = raw_sample_data["renumber_map_offsets"][index + 1]
-
-        renumber_map = raw_sample_data["map"][renumber_map_start:renumber_map_end]
-
-        current_label_hop_offsets = raw_sample_data["label_hop_offsets"][
-            index * fanout_length : (index + 1) * fanout_length + 1
-        ].clone()
-        current_label_hop_offsets -= current_label_hop_offsets[0].clone()
-
-        num_sampled_edges = major_offsets[current_label_hop_offsets].diff()
-
-        num_sampled_nodes_hops = torch.tensor(
-            [
-                minors[: num_sampled_edges[:i].sum()].max() + 1
-                for i in range(1, fanout_length + 1)
-            ],
-            device="cpu",
-        )
-
-        num_seeds = (
-            torch.searchsorted(major_offsets, num_sampled_edges[0]).reshape((1,)).cpu()
-        )
-        num_sampled_nodes = torch.concat(
-            [num_seeds, num_sampled_nodes_hops.diff(prepend=num_seeds)]
-        )
-
-        input_index = raw_sample_data["input_index"][
-            raw_sample_data["input_offsets"][index] : raw_sample_data["input_offsets"][
-                index + 1
-            ]
-        ]
-
-        num_seeds = input_index.numel()
-        input_index = input_index[input_index >= 0]
-
-        num_pos = input_index.numel()
-        num_neg = num_seeds - num_pos
-        if num_neg > 0:
-            edge_label = torch.concat(
-                [
-                    torch.full((num_pos,), 1.0),
-                    torch.full((num_neg,), 0.0),
-                ]
-            )
-        else:
-            edge_label = None
-
-        edge_inverse = (
-            (
-                raw_sample_data["edge_inverse"][
-                    (raw_sample_data["input_offsets"][index] * 2) : (
-                        raw_sample_data["input_offsets"][index + 1] * 2
-                    )
-                ]
-            )
-            if "edge_inverse" in raw_sample_data
-            else None
-        )
-
-        if edge_inverse is None:
-            metadata = (
-                input_index,
-                None,  # TODO this will eventually include time
-            )
-        else:
-            metadata = (
-                input_index,
-                edge_inverse.view(2, -1),
-                edge_label,
-                None,  # TODO this will eventually include time
-            )
-
-        return torch_geometric.sampler.SamplerOutput(
-            node=renumber_map.cpu(),
-            row=minors,
-            col=major_offsets,
-            edge=edge_id.cpu(),
-            batch=renumber_map[:num_seeds],
-            num_sampled_nodes=num_sampled_nodes.cpu(),
-            num_sampled_edges=num_sampled_edges.cpu(),
-            metadata=metadata,
-        )
-
-    def __decode_coo(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int):
-        fanout_length = (raw_sample_data["label_hop_offsets"].numel() - 1) // (
-            raw_sample_data["renumber_map_offsets"].numel() - 1
-        )
-
-        major_minor_start = raw_sample_data["label_hop_offsets"][index * fanout_length]
-        ix_end = (index + 1) * fanout_length
-        if ix_end == raw_sample_data["label_hop_offsets"].numel():
-            major_minor_end = raw_sample_data["majors"].numel()
-        else:
-            major_minor_end = raw_sample_data["label_hop_offsets"][ix_end]
-
-        majors = raw_sample_data["majors"][major_minor_start:major_minor_end]
-        minors = raw_sample_data["minors"][major_minor_start:major_minor_end]
-        edge_id = raw_sample_data["edge_id"][major_minor_start:major_minor_end]
-        # don't retrieve edge type for a homogeneous graph
-
-        renumber_map_start = raw_sample_data["renumber_map_offsets"][index]
-        renumber_map_end = raw_sample_data["renumber_map_offsets"][index + 1]
-
-        renumber_map = raw_sample_data["map"][renumber_map_start:renumber_map_end]
-
-        num_sampled_edges = (
-            raw_sample_data["label_hop_offsets"][
-                index * fanout_length : (index + 1) * fanout_length + 1
-            ]
-            .diff()
-            .cpu()
-        )
-
-        num_seeds = (majors[: num_sampled_edges[0]].max() + 1).reshape((1,)).cpu()
-        num_sampled_nodes_hops = torch.tensor(
-            [
-                minors[: num_sampled_edges[:i].sum()].max() + 1
-                for i in range(1, fanout_length + 1)
-            ],
-            device="cpu",
-        )
-
-        num_sampled_nodes = torch.concat(
-            [num_seeds, num_sampled_nodes_hops.diff(prepend=num_seeds)]
-        )
-
-        input_index = raw_sample_data["input_index"][
-            raw_sample_data["input_offsets"][index] : raw_sample_data["input_offsets"][
-                index + 1
-            ]
-        ]
-
-        edge_inverse = (
-            (
-                raw_sample_data["edge_inverse"][
-                    (raw_sample_data["input_offsets"][index] * 2) : (
-                        raw_sample_data["input_offsets"][index + 1] * 2
-                    )
-                ]
-            )
-            if "edge_inverse" in raw_sample_data
-            else None
-        )
-
-        if edge_inverse is None:
-            metadata = (
-                input_index,
-                None,  # TODO this will eventually include time
-            )
-        else:
-            metadata = (
-                input_index,
-                edge_inverse.view(2, -1),
-                None,
-                None,  # TODO this will eventually include time
-            )
-
-        return torch_geometric.sampler.SamplerOutput(
-            node=renumber_map.cpu(),
-            row=minors,
-            col=majors,
-            edge=edge_id,
-            batch=renumber_map[:num_seeds],
-            num_sampled_nodes=num_sampled_nodes,
-            num_sampled_edges=num_sampled_edges,
-            metadata=metadata,
-        )
-
-    def _decode(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int):
-        if "major_offsets" in raw_sample_data:
-            return self.__decode_csc(raw_sample_data, index)
-        else:
-            return self.__decode_coo(raw_sample_data, index)
-
-
-class BaseSampler:
-    def __init__(
-        self,
-        sampler: DistSampler,
-        data: Tuple[
-            "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"
-        ],
-        batch_size: int = 16,
-    ):
-        self.__sampler = sampler
-        self.__feature_store, self.__graph_store = data
-        self.__batch_size = batch_size
-
-    def sample_from_nodes(
-        self, index: "torch_geometric.sampler.NodeSamplerInput", **kwargs
-    ) -> Iterator[
-        Union[
-            "torch_geometric.sampler.HeteroSamplerOutput",
-            "torch_geometric.sampler.SamplerOutput",
-        ]
-    ]:
-        reader = self.__sampler.sample_from_nodes(
-            index.node, batch_size=self.__batch_size, input_id=index.input_id, **kwargs
-        )
-
-        edge_attrs = self.__graph_store.get_all_edge_attrs()
-        if (
-            len(edge_attrs) == 1
-            and edge_attrs[0].edge_type[0] == edge_attrs[0].edge_type[2]
-        ):
-            return HomogeneousSampleReader(reader)
-        else:
-            # TODO implement heterogeneous sampling
-            raise NotImplementedError(
-                "Sampling heterogeneous graphs is currently"
-                " unsupported in the non-dask API"
-            )
-
-    def sample_from_edges(
-        self,
-        index: "torch_geometric.sampler.EdgeSamplerInput",
-        neg_sampling: Optional["torch_geometric.sampler.NegativeSampling"],
-        **kwargs,
-    ) -> Iterator[
-        Union[
-            "torch_geometric.sampler.HeteroSamplerOutput",
-            "torch_geometric.sampler.SamplerOutput",
-        ]
-    ]:
-        src = index.row
-        dst = index.col
-        input_id = index.input_id
-        neg_batch_size = 0
-        if neg_sampling:
-            # Sample every negative subset at once.
-            # TODO handle temporal sampling (node_time)
-            src_neg, dst_neg = neg_sample(
-                self.__graph_store,
-                index.row,
-                index.col,
-                self.__batch_size,
-                neg_sampling,
-                None,  # src_time,
-                None,  # src_node_time,
-            )
-            if neg_sampling.is_binary():
-                src, _ = neg_cat(src.cuda(), src_neg, self.__batch_size)
-            else:
-                # triplet, cat dst to src so length is the same; will
-                # result in the same set of unique vertices
-                src, _ = neg_cat(src.cuda(), dst_neg, self.__batch_size)
-            dst, neg_batch_size = neg_cat(dst.cuda(), dst_neg, self.__batch_size)
-
-            # Concatenate -1s so the input id tensor lines up and can
-            # be processed by the dist sampler.
-            # When loading the output batch, '-1' will be dropped.
-            input_id, _ = neg_cat(
-                input_id,
-                torch.full(
-                    (dst_neg.numel(),), -1, dtype=torch.int64, device=input_id.device
-                ),
-                self.__batch_size,
-            )
-
-        # TODO for temporal sampling, node times have to be
-        # adjusted here.
-        reader = self.__sampler.sample_from_edges(
-            torch.stack([src, dst]),  # reverse of usual convention
-            input_id=input_id,
-            batch_size=self.__batch_size + neg_batch_size,
-            **kwargs,
-        )
-
-        edge_attrs = self.__graph_store.get_all_edge_attrs()
-        if (
-            len(edge_attrs) == 1
-            and edge_attrs[0].edge_type[0] == edge_attrs[0].edge_type[2]
-        ):
-            return HomogeneousSampleReader(reader)
-        else:
-            # TODO implement heterogeneous sampling
-            raise NotImplementedError(
-                "Sampling heterogeneous graphs is currently"
-                " unsupported in the non-dask API"
-            )
diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py
deleted file mode 100644
index b3d56ef9992..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py
+++ /dev/null
@@ -1,531 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from typing import Sequence, Dict, Tuple
-
-from math import ceil
-
-from cugraph_pyg.data import GraphStore, DaskGraphStore
-
-from cugraph.utilities.utils import import_optional
-import cudf
-import cupy
-import pylibcugraph
-
-dask_cudf = import_optional("dask_cudf")
-torch_geometric = import_optional("torch_geometric")
-
-torch = import_optional("torch")
-HeteroSamplerOutput = torch_geometric.sampler.base.HeteroSamplerOutput
-
-
-def _get_unique_nodes(
-    sampling_results: cudf.DataFrame,
-    graph_store: DaskGraphStore,
-    node_type: str,
-    node_position: str,
-) -> int:
-    """
-    Counts the number of unique nodes of a given node type.
-
-    Parameters
-    ----------
-    sampling_results: cudf.DataFrame
-        The dataframe containing sampling results or filtered sampling results
-        (i.e. sampling results for hop 2)
-    graph_store: DaskGraphStore
-        The graph store containing the structure of the sampled graph.
-    node_type: str
-        The node type to count the number of unique nodes of.
-    node_position: str ('src' or 'dst')
-        Whether to examine source or destination nodes.
-
-    Returns
-    -------
-    cudf.Series
-        The unique nodes of the given node type.
-    """
-    if node_position == "src":
-        edge_index = "majors"
-        edge_sel = 0
-    elif node_position == "dst":
-        edge_index = "minors"
-        edge_sel = -1
-    else:
-        raise ValueError(f"Illegal value {node_position} for node_position")
-
-    etypes = [
-        graph_store.canonical_edge_type_to_numeric(et)
-        for et in graph_store.edge_types
-        if et[edge_sel] == node_type
-    ]
-    if len(etypes) > 0:
-        f = sampling_results.edge_type == etypes[0]
-        for et in etypes[1:]:
-            f |= sampling_results.edge_type == et
-
-        sampling_results_node = sampling_results[f]
-    else:
-        return cudf.Series([], dtype="int64")
-
-    return sampling_results_node[edge_index]
-
-
-def _sampler_output_from_sampling_results_homogeneous_coo(
-    sampling_results: cudf.DataFrame,
-    renumber_map: torch.Tensor,
-    graph_store: DaskGraphStore,
-    data_index: Dict[Tuple[int, int], Dict[str, int]],
-    batch_id: int,
-    metadata: Sequence = None,
-) -> HeteroSamplerOutput:
-    """
-    Parameters
-    ----------
-    sampling_results: cudf.DataFrame
-        The dataframe containing sampling results.
-    renumber_map: torch.Tensor
-        The tensor containing the renumber map, or None if there
-        is no renumber map.
-    graph_store: DaskGraphStore
-        The graph store containing the structure of the sampled graph.
-    data_index: Dict[Tuple[int, int], Dict[str, int]]
-        Dictionary where keys are the batch id and hop id,
-        and values are dictionaries containing the max src
-        and max dst node ids for the batch and hop.
-    batch_id: int
-        The current batch id, whose samples are being retrieved
-        from the sampling results and data index.
-    metadata: Tensor
-        The metadata for the sampled batch.
-
-    Returns
-    -------
-    HeteroSamplerOutput
-    """
-
-    if len(graph_store.edge_types) > 1 or len(graph_store.node_types) > 1:
-        raise ValueError("Graph is heterogeneous")
-
-    hops = torch.arange(
-        sampling_results.hop_id.iloc[len(sampling_results) - 1] + 1, device="cuda"
-    )
-    hops = torch.searchsorted(
-        torch.as_tensor(sampling_results.hop_id, device="cuda"), hops
-    )
-
-    node_type = graph_store.node_types[0]
-    edge_type = graph_store.edge_types[0]
-
-    num_nodes_per_hop_dict = {node_type: torch.zeros(len(hops) + 1, dtype=torch.int64)}
-    num_edges_per_hop_dict = {edge_type: torch.zeros(len(hops), dtype=torch.int64)}
-
-    if renumber_map is None:
-        raise ValueError("Renumbered input is expected for homogeneous graphs")
-
-    noi_index = {node_type: torch.as_tensor(renumber_map, device="cuda")}
-
-    row_dict = {
-        edge_type: torch.as_tensor(sampling_results.majors, device="cuda"),
-    }
-
-    col_dict = {
-        edge_type: torch.as_tensor(sampling_results.minors, device="cuda"),
-    }
-
-    num_nodes_per_hop_dict[node_type][0] = data_index[batch_id, 0]["src_max"] + 1
-    for hop in range(len(hops)):
-        hop_ix_start = hops[hop]
-        hop_ix_end = hops[hop + 1] if hop < len(hops) - 1 else len(sampling_results)
-
-        if num_nodes_per_hop_dict[node_type][hop] > 0:
-            max_id_hop = data_index[batch_id, hop]["dst_max"]
-            max_id_prev_hop = (
-                data_index[batch_id, hop - 1]["dst_max"]
-                if hop > 0
-                else data_index[batch_id, 0]["src_max"]
-            )
-
-            if max_id_hop > max_id_prev_hop:
-                num_nodes_per_hop_dict[node_type][hop + 1] = (
-                    max_id_hop - max_id_prev_hop
-                )
-            else:
-                num_nodes_per_hop_dict[node_type][hop + 1] = 0
-        # will default to 0 if the previous hop was 0, since this is a PyG requirement
-
-        num_edges_per_hop_dict[edge_type][hop] = hop_ix_end - hop_ix_start
-
-    if HeteroSamplerOutput is None:
-        raise ImportError("Error importing from pyg")
-
-    return HeteroSamplerOutput(
-        node=noi_index,
-        row=row_dict,
-        col=col_dict,
-        edge=None,
-        num_sampled_nodes={k: t.tolist() for k, t in num_nodes_per_hop_dict.items()},
-        num_sampled_edges={k: t.tolist() for k, t in num_edges_per_hop_dict.items()},
-        metadata=metadata,
-    )
-
-
-def _sampler_output_from_sampling_results_homogeneous_csr(
-    major_offsets: torch.Tensor,
-    minors: torch.Tensor,
-    renumber_map: torch.Tensor,
-    graph_store: DaskGraphStore,
-    label_hop_offsets: torch.Tensor,
-    batch_id: int,
-    metadata: Sequence = None,
-) -> HeteroSamplerOutput:
-    """
-    Parameters
-    ----------
-    major_offsets: torch.Tensor
-        The major offsets for the CSC/CSR matrix ("row pointer")
-    minors: torch.Tensor
-        The minors for the CSC/CSR matrix ("col index")
-    renumber_map: torch.Tensor
-        The tensor containing the renumber map.
-        Required.
-    graph_store: DaskGraphStore
-        The graph store containing the structure of the sampled graph.
-    label_hop_offsets: torch.Tensor
-        The tensor containing the label-hop offsets.
-    batch_id: int
-        The current batch id, whose samples are being retrieved
-        from the sampling results and data index.
-    metadata: Tensor
-        The metadata for the sampled batch.
-
-    Returns
-    -------
-    HeteroSamplerOutput
-    """
-
-    if len(graph_store.edge_types) > 1 or len(graph_store.node_types) > 1:
-        raise ValueError("Graph is heterogeneous")
-
-    if renumber_map is None:
-        raise ValueError("Renumbered input is expected for homogeneous graphs")
-    node_type = graph_store.node_types[0]
-    edge_type = graph_store.edge_types[0]
-
-    major_offsets = major_offsets.clone() - major_offsets[0]
-    label_hop_offsets = label_hop_offsets.clone() - label_hop_offsets[0]
-
-    num_edges_per_hop_dict = {
-        edge_type: major_offsets[label_hop_offsets].diff().tolist()
-    }
-
-    label_hop_offsets = label_hop_offsets.cpu()
-    num_nodes_per_hop_dict = {
-        node_type: torch.concat(
-            [
-                label_hop_offsets.diff(),
-                (renumber_map.shape[0] - label_hop_offsets[-1]).reshape((1,)),
-            ]
-        ).tolist()
-    }
-
-    noi_index = {node_type: torch.as_tensor(renumber_map, device="cuda")}
-
-    col_dict = {
-        edge_type: major_offsets,
-    }
-
-    row_dict = {
-        edge_type: minors,
-    }
-
-    if HeteroSamplerOutput is None:
-        raise ImportError("Error importing from pyg")
-
-    return HeteroSamplerOutput(
-        node=noi_index,
-        row=row_dict,
-        col=col_dict,
-        edge=None,
-        num_sampled_nodes=num_nodes_per_hop_dict,
-        num_sampled_edges=num_edges_per_hop_dict,
-        metadata=metadata,
-    )
-
-
-def _sampler_output_from_sampling_results_heterogeneous(
-    sampling_results: cudf.DataFrame,
-    renumber_map: cudf.Series,
-    graph_store: DaskGraphStore,
-    metadata: Sequence = None,
-) -> HeteroSamplerOutput:
-    """
-    Parameters
-    ----------
-    sampling_results: cudf.DataFrame
-        The dataframe containing sampling results.
-    renumber_map: cudf.Series
-        The series containing the renumber map, or None if there
-        is no renumber map.
-    graph_store: DaskGraphStore
-        The graph store containing the structure of the sampled graph.
-    metadata: Tensor
-        The metadata for the sampled batch.
-
-    Returns
-    -------
-    HeteroSamplerOutput
-    """
-
-    hops = torch.arange(sampling_results.hop_id.max() + 1, device="cuda")
-    hops = torch.searchsorted(
-        torch.as_tensor(sampling_results.hop_id, device="cuda"), hops
-    )
-
-    num_nodes_per_hop_dict = {}
-    num_edges_per_hop_dict = {}
-
-    # Fill out hop 0 in num_nodes_per_hop_dict, which is based on src instead of dst
-    sampling_results_hop_0 = sampling_results.iloc[
-        0 : (hops[1] if len(hops) > 1 else len(sampling_results))
-    ]
-
-    for node_type in graph_store.node_types:
-        num_unique_nodes = _get_unique_nodes(
-            sampling_results_hop_0, graph_store, node_type, "src"
-        ).nunique()
-
-        if num_unique_nodes > 0:
-            num_nodes_per_hop_dict[node_type] = torch.zeros(
-                len(hops) + 1, dtype=torch.int64
-            )
-            num_nodes_per_hop_dict[node_type][0] = num_unique_nodes
-
-    if renumber_map is not None:
-        raise ValueError(
-            "Precomputing the renumber map is currently "
-            "unsupported for heterogeneous graphs."
-        )
-
-    # Calculate nodes of interest based on unique nodes in order of appearance
-    # Use hop 0 sources since those are the only ones not included in destinations
-    # Use torch.concat based on benchmark performance (vs. cudf.concat)
-
-    if sampling_results_hop_0 is None:
-        sampling_results_hop_0 = sampling_results.iloc[
-            0 : (hops[1] if len(hops) > 1 else len(sampling_results))
-        ]
-
-    nodes_of_interest = (
-        cudf.Series(
-            torch.concat(
-                [
-                    torch.as_tensor(sampling_results_hop_0.majors, device="cuda"),
-                    torch.as_tensor(sampling_results.minors, device="cuda"),
-                ]
-            ),
-            name="nodes_of_interest",
-        )
-        .drop_duplicates()
-        .sort_index()
-    )
-
-    # Get the grouped node index (for creating the renumbered grouped edge index)
-    noi_index = graph_store._get_vertex_groups_from_sample(
-        torch.as_tensor(nodes_of_interest, device="cuda")
-    )
-    del nodes_of_interest
-
-    # Get the new edge index (by type as expected for HeteroData)
-    # FIXME handle edge ids/types after the C++ updates
-    row_dict, col_dict = graph_store._get_renumbered_edge_groups_from_sample(
-        sampling_results, noi_index
-    )
-
-    for hop in range(len(hops)):
-        hop_ix_start = hops[hop]
-        hop_ix_end = hops[hop + 1] if hop < len(hops) - 1 else len(sampling_results)
-        sampling_results_to_hop = sampling_results.iloc[0:hop_ix_end]
-
-        for node_type in graph_store.node_types:
-            unique_nodes_hop = _get_unique_nodes(
-                sampling_results_to_hop, graph_store, node_type, "dst"
-            )
-
-            unique_nodes_0 = _get_unique_nodes(
-                sampling_results_hop_0, graph_store, node_type, "src"
-            )
-
-            num_unique_nodes = cudf.concat([unique_nodes_0, unique_nodes_hop]).nunique()
-
-            if num_unique_nodes > 0:
-                if node_type not in num_nodes_per_hop_dict:
-                    num_nodes_per_hop_dict[node_type] = torch.zeros(
-                        len(hops) + 1, dtype=torch.int64
-                    )
-                num_nodes_per_hop_dict[node_type][hop + 1] = num_unique_nodes - int(
-                    num_nodes_per_hop_dict[node_type][: hop + 1].sum(0)
-                )
-
-        numeric_etypes, counts = torch.unique(
-            torch.as_tensor(
-                sampling_results.iloc[hop_ix_start:hop_ix_end].edge_type,
-                device="cuda",
-            ),
-            return_counts=True,
-        )
-        numeric_etypes = list(numeric_etypes)
-        counts = list(counts)
-        for num_etype, count in zip(numeric_etypes, counts):
-            can_etype = graph_store.numeric_edge_type_to_canonical(num_etype)
-            if can_etype not in num_edges_per_hop_dict:
-                num_edges_per_hop_dict[can_etype] = torch.zeros(
-                    len(hops), dtype=torch.int64
-                )
-            num_edges_per_hop_dict[can_etype][hop] = count
-
-    if HeteroSamplerOutput is None:
-        raise ImportError("Error importing from pyg")
-
-    return HeteroSamplerOutput(
-        node=noi_index,
-        row=row_dict,
-        col=col_dict,
-        edge=None,
-        num_sampled_nodes={k: t.tolist() for k, t in num_nodes_per_hop_dict.items()},
-        num_sampled_edges={k: t.tolist() for k, t in num_edges_per_hop_dict.items()},
-        metadata=metadata,
-    )
-
-
-def filter_cugraph_pyg_store(
-    feature_store,
-    graph_store,
-    node,
-    row,
-    col,
-    edge,
-    clx,
-) -> "torch_geometric.data.Data":
-    data = torch_geometric.data.Data()
-
-    data.edge_index = torch.stack([row, col], dim=0)
-
-    required_attrs = []
-    for attr in feature_store.get_all_tensor_attrs():
-        attr.index = edge if isinstance(attr.group_name, tuple) else node
-        required_attrs.append(attr)
-        data.num_nodes = attr.index.size(0)
-
-    tensors = feature_store.multi_get_tensor(required_attrs)
-    for i, attr in enumerate(required_attrs):
-        data[attr.attr_name] = tensors[i]
-
-    return data
-
-
-def neg_sample(
-    graph_store: GraphStore,
-    seed_src: "torch.Tensor",
-    seed_dst: "torch.Tensor",
-    batch_size: int,
-    neg_sampling: "torch_geometric.sampler.NegativeSampling",
-    time: "torch.Tensor",
-    node_time: "torch.Tensor",
-) -> Tuple["torch.Tensor", "torch.Tensor"]:
-    try:
-        # Compatibility for PyG 2.5
-        src_weight = neg_sampling.src_weight
-        dst_weight = neg_sampling.dst_weight
-    except AttributeError:
-        src_weight = neg_sampling.weight
-        dst_weight = neg_sampling.weight
-    unweighted = src_weight is None and dst_weight is None
-
-    # Require at least one negative edge per batch
-    num_neg = max(
-        int(ceil(neg_sampling.amount * seed_src.numel())),
-        int(ceil(seed_src.numel() / batch_size)),
-    )
-
-    if graph_store.is_multi_gpu:
-        num_neg_global = torch.tensor([num_neg], device="cuda")
-        torch.distributed.all_reduce(num_neg_global, op=torch.distributed.ReduceOp.SUM)
-        num_neg = int(num_neg_global)
-    else:
-        num_neg_global = num_neg
-
-    if node_time is None:
-        result_dict = pylibcugraph.negative_sampling(
-            graph_store._resource_handle,
-            graph_store._graph,
-            num_neg_global,
-            vertices=None
-            if unweighted
-            else cupy.arange(src_weight.numel(), dtype="int64"),
-            src_bias=None if src_weight is None else cupy.asarray(src_weight),
-            dst_bias=None if dst_weight is None else cupy.asarray(dst_weight),
-            remove_duplicates=False,
-            remove_false_negatives=False,
-            exact_number_of_samples=True,
-            do_expensive_check=False,
-        )
-
-        src_neg = torch.as_tensor(result_dict["sources"], device="cuda")[:num_neg]
-        dst_neg = torch.as_tensor(result_dict["destinations"], device="cuda")[:num_neg]
-
-        # TODO modifiy the C API so this condition is impossible
-        if src_neg.numel() < num_neg:
-            num_gen = num_neg - src_neg.numel()
-            src_neg = torch.concat(
-                [
-                    src_neg,
-                    torch.randint(
-                        0, src_neg.max(), (num_gen,), device="cuda", dtype=torch.int64
-                    ),
-                ]
-            )
-            dst_neg = torch.concat(
-                [
-                    dst_neg,
-                    torch.randint(
-                        0, dst_neg.max(), (num_gen,), device="cuda", dtype=torch.int64
-                    ),
-                ]
-            )
-        return src_neg, dst_neg
-    raise NotImplementedError(
-        "Temporal negative sampling is currently unimplemented in cuGraph-PyG"
-    )
-
-
-def neg_cat(
-    seed_pos: "torch.Tensor", seed_neg: "torch.Tensor", pos_batch_size: int
-) -> Tuple["torch.Tensor", int]:
-    num_seeds = seed_pos.numel()
-    num_batches = int(ceil(num_seeds / pos_batch_size))
-    neg_batch_size = int(ceil(seed_neg.numel() / num_batches))
-
-    batch_pos_offsets = torch.full((num_batches,), pos_batch_size).cumsum(-1)[:-1]
-    seed_pos_splits = torch.tensor_split(seed_pos, batch_pos_offsets)
-
-    batch_neg_offsets = torch.full((num_batches,), neg_batch_size).cumsum(-1)[:-1]
-    seed_neg_splits = torch.tensor_split(seed_neg, batch_neg_offsets)
-
-    return (
-        torch.concatenate(
-            [torch.concatenate(s) for s in zip(seed_pos_splits, seed_neg_splits)]
-        ),
-        neg_batch_size,
-    )
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py
deleted file mode 100644
index 30994289f9c..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py
+++ /dev/null
@@ -1,315 +0,0 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import pytest
-
-from dask_cuda.initialize import initialize as dask_initialize
-from dask_cuda import LocalCUDACluster
-from dask.distributed import Client
-from cugraph.dask.comms import comms as Comms
-from cugraph.dask.common.mg_utils import get_visible_devices
-from cugraph.testing.mg_utils import stop_dask_client
-
-import torch
-import numpy as np
-from cugraph.gnn import FeatureStore
-from cugraph.datasets import karate
-
-import tempfile
-
-# module-wide fixtures
-
-# If the rapids-pytest-benchmark plugin is installed, the "gpubenchmark"
-# fixture will be available automatically. Check that this fixture is available
-# by trying to import rapids_pytest_benchmark, and if that fails, set
-# "gpubenchmark" to the standard "benchmark" fixture provided by
-# pytest-benchmark.
-try:
-    import rapids_pytest_benchmark  # noqa: F401
-except ImportError:
-    import pytest_benchmark
-
-    gpubenchmark = pytest_benchmark.plugin.benchmark
-
-
-@pytest.fixture(scope="module")
-def dask_client():
-    dask_scheduler_file = os.environ.get("SCHEDULER_FILE")
-    cuda_visible_devices = get_visible_devices()
-
-    if dask_scheduler_file is not None:
-        dask_initialize()
-        dask_client = Client(scheduler_file=dask_scheduler_file)
-    else:
-        # The tempdir created by tempdir_object should be cleaned up once
-        # tempdir_object goes out-of-scope and is deleted.
-        tempdir_object = tempfile.TemporaryDirectory()
-        cluster = LocalCUDACluster(
-            local_directory=tempdir_object.name,
-            protocol="tcp",
-            CUDA_VISIBLE_DEVICES=cuda_visible_devices,
-        )
-
-        dask_client = Client(cluster)
-        dask_client.wait_for_workers(len(cuda_visible_devices))
-
-    if not Comms.is_initialized():
-        Comms.initialize(p2p=True)
-
-    yield dask_client
-
-    stop_dask_client(dask_client)
-    print("\ndask_client fixture: client.close() called")
-
-
-@pytest.fixture
-def karate_gnn():
-    el = karate.get_edgelist().reset_index(drop=True)
-    el.src = el.src.astype("int64")
-    el.dst = el.dst.astype("int64")
-    all_vertices = np.array_split(np.arange(34), 2)
-
-    F = FeatureStore(backend="torch")
-    F.add_data(
-        torch.arange(len(all_vertices[0]), dtype=torch.float32) * 31,
-        "type0",
-        "prop0",
-    )
-    F.add_data(
-        torch.arange(len(all_vertices[1]), dtype=torch.float32) * 41,
-        "type1",
-        "prop0",
-    )
-
-    N = {
-        "type0": len(all_vertices[0]),
-        "type1": len(all_vertices[1]),
-    }
-
-    offsets = {"type0": 0, "type1": N["type0"]}
-
-    G = {
-        ("type0", "et01", "type1"): el[
-            el.src.isin(all_vertices[0]) & el.dst.isin(all_vertices[1])
-        ].reset_index(drop=True),
-        ("type1", "et10", "type0"): el[
-            el.src.isin(all_vertices[1]) & el.dst.isin(all_vertices[0])
-        ].reset_index(drop=True),
-        ("type0", "et00", "type0"): el[
-            el.src.isin(all_vertices[0]) & el.dst.isin(all_vertices[0])
-        ],
-        ("type1", "et11", "type1"): el[
-            el.src.isin(all_vertices[1]) & el.dst.isin(all_vertices[1])
-        ].reset_index(drop=True),
-    }
-
-    G = {
-        (src_type, edge_type, dst_type): (
-            torch.tensor(elx["src"].values_host - offsets[src_type]),
-            torch.tensor(elx["dst"].values_host - offsets[dst_type]),
-        )
-        for (src_type, edge_type, dst_type), elx in G.items()
-    }
-
-    return F, G, N
-
-
-@pytest.fixture
-def basic_graph_1():
-    G = {
-        ("vt1", "pig", "vt1"): [
-            torch.tensor([0, 0, 1, 2, 2, 3]),
-            torch.tensor([1, 2, 4, 3, 4, 1]),
-        ]
-    }
-
-    N = {"vt1": 5}
-
-    F = FeatureStore()
-    F.add_data(
-        torch.tensor([100, 200, 300, 400, 500]), type_name="vt1", feat_name="prop1"
-    )
-
-    F.add_data(torch.tensor([5, 4, 3, 2, 1]), type_name="vt1", feat_name="prop2")
-
-    return F, G, N
-
-
-@pytest.fixture
-def multi_edge_graph_1():
-    G = {
-        ("vt1", "pig", "vt1"): [torch.tensor([0, 2, 3, 1]), torch.tensor([1, 3, 1, 4])],
-        ("vt1", "dog", "vt1"): [torch.tensor([0, 3, 4]), torch.tensor([2, 2, 3])],
-        ("vt1", "cat", "vt1"): [
-            torch.tensor([1, 2, 2]),
-            torch.tensor([4, 3, 4]),
-        ],
-    }
-
-    N = {"vt1": 5}
-
-    F = FeatureStore()
-    F.add_data(
-        torch.tensor([100, 200, 300, 400, 500]), type_name="vt1", feat_name="prop1"
-    )
-
-    F.add_data(torch.tensor([5, 4, 3, 2, 1]), type_name="vt1", feat_name="prop2")
-
-    return F, G, N
-
-
-@pytest.fixture
-def multi_edge_multi_vertex_graph_1():
-
-    G = {
-        ("brown", "horse", "brown"): [
-            torch.tensor([0, 0]),
-            torch.tensor([1, 2]),
-        ],
-        ("brown", "tortoise", "black"): [
-            torch.tensor([1, 1, 2]),
-            torch.tensor([1, 0, 1]),
-        ],
-        ("brown", "mongoose", "black"): [
-            torch.tensor([2, 1]),
-            torch.tensor([0, 1]),
-        ],
-        ("black", "cow", "brown"): [
-            torch.tensor([0, 0]),
-            torch.tensor([1, 2]),
-        ],
-        ("black", "snake", "black"): [
-            torch.tensor([1]),
-            torch.tensor([0]),
-        ],
-    }
-
-    N = {"brown": 3, "black": 2}
-
-    F = FeatureStore()
-    F.add_data(torch.tensor([100, 200, 300]), type_name="brown", feat_name="prop1")
-
-    F.add_data(torch.tensor([400, 500]), type_name="black", feat_name="prop1")
-
-    F.add_data(torch.tensor([5, 4, 3]), type_name="brown", feat_name="prop2")
-
-    F.add_data(torch.tensor([2, 1]), type_name="black", feat_name="prop2")
-
-    return F, G, N
-
-
-@pytest.fixture
-def multi_edge_multi_vertex_no_graph_1():
-    G = {
-        ("brown", "horse", "brown"): 2,
-        ("brown", "tortoise", "black"): 3,
-        ("brown", "mongoose", "black"): 3,
-        ("black", "cow", "brown"): 3,
-        ("black", "snake", "black"): 1,
-    }
-
-    N = {"brown": 3, "black": 2}
-
-    F = FeatureStore()
-    F.add_data(np.array([100, 200, 300]), type_name="brown", feat_name="prop1")
-
-    F.add_data(np.array([400, 500]), type_name="black", feat_name="prop1")
-
-    F.add_data(np.array([5, 4, 3]), type_name="brown", feat_name="prop2")
-
-    F.add_data(np.array([2, 1]), type_name="black", feat_name="prop2")
-
-    return F, G, N
-
-
-@pytest.fixture
-def abc_graph():
-    N = {
-        "A": 2,  # 0, 1
-        "B": 3,  # 2, 3, 4
-        "C": 4,  # 5, 6, 7, 8
-    }
-
-    G = {
-        # (0->2, 0->3, 1->3)
-        ("A", "ab", "B"): [
-            torch.tensor([0, 0, 1], dtype=torch.int64),
-            torch.tensor([0, 1, 1], dtype=torch.int64),
-        ],
-        # (2->0, 2->1, 3->1, 4->0)
-        ("B", "ba", "A"): [
-            torch.tensor([0, 0, 1, 2], dtype=torch.int64),
-            torch.tensor([0, 1, 1, 0], dtype=torch.int64),
-        ],
-        # (2->6, 2->8, 3->5, 3->7, 4->5, 4->8)
-        ("B", "bc", "C"): [
-            torch.tensor([0, 0, 1, 1, 2, 2], dtype=torch.int64),
-            torch.tensor([1, 3, 0, 2, 0, 3], dtype=torch.int64),
-        ],
-    }
-
-    F = FeatureStore()
-    F.add_data(
-        torch.tensor([3.2, 2.1], dtype=torch.float32), type_name="A", feat_name="prop1"
-    )
-
-    return F, G, N
-
-
-@pytest.fixture
-def basic_pyg_graph_1():
-    edge_index = torch.tensor([[0, 1, 2, 3], [0, 0, 1, 1]])
-    size = (4, 4)
-    return edge_index, size
-
-
-@pytest.fixture
-def basic_pyg_graph_2():
-    edge_index = torch.tensor(
-        [
-            [0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9],
-            [1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0],
-        ]
-    )
-    size = (10, 10)
-    return edge_index, size
-
-
-@pytest.fixture
-def sample_pyg_hetero_data():
-    torch.manual_seed(12345)
-    raw_data_dict = {
-        "v0": torch.randn(6, 3),
-        "v1": torch.randn(7, 2),
-        "v2": torch.randn(5, 4),
-        ("v2", "e0", "v1"): torch.tensor([[0, 2, 2, 4, 4], [4, 3, 6, 0, 1]]),
-        ("v1", "e1", "v1"): torch.tensor(
-            [[0, 2, 2, 2, 3, 5, 5], [4, 0, 4, 5, 3, 0, 1]]
-        ),
-        ("v0", "e2", "v0"): torch.tensor([[0, 2, 2, 3, 5, 5], [1, 1, 5, 1, 1, 2]]),
-        ("v1", "e3", "v2"): torch.tensor(
-            [[0, 1, 1, 2, 4, 5, 6], [1, 2, 3, 1, 2, 2, 2]]
-        ),
-        ("v0", "e4", "v2"): torch.tensor([[1, 1, 3, 3, 4, 4], [1, 4, 1, 4, 0, 3]]),
-    }
-
-    # create a nested dictionary to facilitate PyG's HeteroData construction
-    hetero_data_dict = {}
-    for key, value in raw_data_dict.items():
-        if isinstance(key, tuple):
-            hetero_data_dict[key] = {"edge_index": value}
-        else:
-            hetero_data_dict[key] = {"x": value}
-
-    return hetero_data_dict
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py
deleted file mode 100644
index 0a997a960b8..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py
+++ /dev/null
@@ -1,413 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import cugraph
-from cugraph_pyg.data.dask_graph_store import (
-    CuGraphTensorAttr,
-    CuGraphEdgeAttr,
-    EdgeLayout,
-)
-from cugraph_pyg.data import DaskGraphStore
-
-import cudf
-import cupy
-import numpy as np
-
-from cugraph.utilities.utils import import_optional, MissingModule
-
-import pytest
-
-
-torch = import_optional("torch")
-torch_geometric = import_optional("torch_geometric")
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_tensor_attr():
-    ta = CuGraphTensorAttr("group0", "property1")
-    assert not ta.is_fully_specified()
-    assert not ta.is_set("index")
-
-    ta.fully_specify()
-    assert ta.is_fully_specified()
-
-    other_ta = CuGraphTensorAttr(index=[1, 2, 3])
-    ta.update(other_ta)
-    assert ta.index == [1, 2, 3]
-
-    casted_ta1 = CuGraphTensorAttr.cast(ta)
-    assert casted_ta1 == ta
-
-    casted_ta2 = CuGraphTensorAttr.cast(index=[1, 2, 3])
-    assert casted_ta2.index == [1, 2, 3]
-    assert not casted_ta2.is_fully_specified()
-
-    casted_ta3 = CuGraphTensorAttr.cast(
-        "group2",
-        "property2",
-        [1, 2, 3],
-    )
-    assert casted_ta3.group_name == "group2"
-    assert casted_ta3.attr_name == "property2"
-    assert casted_ta3.index == [1, 2, 3]
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_edge_attr():
-    ea = CuGraphEdgeAttr("type0", EdgeLayout.COO, False, 10)
-    assert ea.edge_type == "type0"
-    assert ea.layout == EdgeLayout.COO
-    assert not ea.is_sorted
-    assert ea.size == 10
-
-    ea = CuGraphEdgeAttr(edge_type="type1", layout="csr", is_sorted=True)
-    assert ea.size is None
-
-    ea = CuGraphEdgeAttr.cast("type0", EdgeLayout.COO, False, 10)
-    assert ea.edge_type == "type0"
-    assert ea.layout == EdgeLayout.COO
-    assert not ea.is_sorted
-    assert ea.size == 10
-
-
-@pytest.fixture(
-    params=[
-        "basic_graph_1",
-        "multi_edge_graph_1",
-        "multi_edge_multi_vertex_graph_1",
-    ]
-)
-def graph(request):
-    return request.getfixturevalue(request.param)
-
-
-@pytest.fixture(params=["basic_graph_1", "multi_edge_graph_1"])
-def single_vertex_graph(request):
-    return request.getfixturevalue(request.param)
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.parametrize("edge_index_type", ["numpy", "torch-cpu", "torch-gpu", "cudf"])
-@pytest.mark.sg
-def test_get_edge_index(graph, edge_index_type):
-    F, G, N = graph
-    if "torch" in edge_index_type:
-        if edge_index_type == "torch-cpu":
-            device = "cpu"
-        else:
-            device = "cuda"
-        for et in list(G.keys()):
-            G[et][0] = torch.as_tensor(G[et][0], device=device)
-            G[et][1] = torch.as_tensor(G[et][1], device=device)
-    elif edge_index_type == "cudf":
-        for et in list(G.keys()):
-            G[et][0] = cudf.Series(G[et][0])
-            G[et][1] = cudf.Series(G[et][1])
-
-    cugraph_store = DaskGraphStore(F, G, N, order="CSC")
-
-    for pyg_can_edge_type in G:
-        src, dst = cugraph_store.get_edge_index(
-            edge_type=pyg_can_edge_type, layout="coo", is_sorted=False
-        )
-
-        if edge_index_type == "cudf":
-            assert G[pyg_can_edge_type][0].values_host.tolist() == src.tolist()
-            assert G[pyg_can_edge_type][1].values_host.tolist() == dst.tolist()
-        else:
-            assert G[pyg_can_edge_type][0].tolist() == src.tolist()
-            assert G[pyg_can_edge_type][1].tolist() == dst.tolist()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_edge_types(graph):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    eta = cugraph_store._edge_types_to_attrs
-    assert eta.keys() == G.keys()
-
-    for attr_name, attr_repr in eta.items():
-        src_size = N[attr_name[0]]
-        dst_size = N[attr_name[-1]]
-        assert src_size == attr_repr.size[0]
-        assert dst_size == attr_repr.size[-1]
-        assert attr_name == attr_repr.edge_type
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_get_subgraph(graph):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    if len(G.keys()) > 1:
-        for edge_type in G.keys():
-            # Subgraphing is not implemented yet and should raise an error
-            with pytest.raises(ValueError):
-                sg = cugraph_store._subgraph([edge_type])
-
-    sg = cugraph_store._subgraph(list(G.keys()))
-    assert isinstance(sg, cugraph.MultiGraph)
-
-    num_edges = sum([len(v[0]) for v in G.values()])
-    assert sg.number_of_edges() == num_edges
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_renumber_vertices_basic(single_vertex_graph):
-    F, G, N = single_vertex_graph
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    nodes_of_interest = torch.as_tensor(
-        cupy.random.randint(0, sum(N.values()), 3), device="cuda"
-    )
-
-    index = cugraph_store._get_vertex_groups_from_sample(nodes_of_interest)
-    assert index["vt1"].tolist() == nodes_of_interest.tolist()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_renumber_vertices_multi_edge_multi_vertex(multi_edge_multi_vertex_graph_1):
-    F, G, N = multi_edge_multi_vertex_graph_1
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    nodes_of_interest = torch.as_tensor(
-        cupy.random.randint(0, sum(N.values()), 3), device="cuda"
-    ).unique()
-
-    index = cugraph_store._get_vertex_groups_from_sample(nodes_of_interest)
-
-    black_nodes = nodes_of_interest[nodes_of_interest <= 1]
-    brown_nodes = nodes_of_interest[nodes_of_interest > 1] - 2
-
-    if len(black_nodes) > 0:
-        assert index["black"].tolist() == sorted(black_nodes.tolist())
-    if len(brown_nodes) > 0:
-        assert index["brown"].tolist() == sorted(brown_nodes.tolist())
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_renumber_edges(abc_graph):
-    F, G, N = abc_graph
-
-    graph_store = DaskGraphStore(F, G, N, order="CSR")
-
-    # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3]
-    mock_sampling_results = cudf.DataFrame(
-        {
-            "majors": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"),
-            "minors": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"),
-            "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"),
-            "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"),
-        }
-    )
-
-    mock_noi_index = {
-        "A": torch.tensor([0, 1], device="cuda"),
-        "B": torch.tensor([0, 1], device="cuda"),
-        "C": torch.tensor([3, 2, 0], device="cuda"),
-    }
-
-    row_dict, col_dict = graph_store._get_renumbered_edge_groups_from_sample(
-        mock_sampling_results, mock_noi_index
-    )
-
-    assert len(row_dict) == 3
-    assert len(col_dict) == 3
-    assert row_dict[("A", "ab", "B")].tolist() == [0, 0, 1, 1]
-    assert col_dict[("A", "ab", "B")].tolist() == [0, 1, 1, 1]
-    assert row_dict[("B", "bc", "C")].tolist() == [0, 1, 1, 1]
-    assert col_dict[("B", "bc", "C")].tolist() == [0, 1, 2, 1]
-    assert row_dict[("B", "ba", "A")].tolist() == [1, 1]
-    assert col_dict[("B", "ba", "A")].tolist() == [1, 1]
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_get_tensor(graph):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    for feature_name, feature_on_types in F.get_feature_list().items():
-        for type_name in feature_on_types:
-            v_ids = np.arange(N[type_name])
-            base_series = F.get_data(
-                v_ids,
-                type_name=type_name,
-                feat_name=feature_name,
-            ).tolist()
-
-            tsr = cugraph_store.get_tensor(
-                type_name, feature_name, v_ids, None, cupy.int64
-            ).tolist()
-
-            assert tsr == base_series
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_get_tensor_empty_idx(karate_gnn):
-    F, G, N = karate_gnn
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    t = cugraph_store.get_tensor(
-        CuGraphTensorAttr(group_name="type0", attr_name="prop0", index=None)
-    )
-    assert t.tolist() == (torch.arange(17, dtype=torch.float32) * 31).tolist()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_multi_get_tensor(graph):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    for vertex_type in sorted(N.keys()):
-        v_ids = np.arange(N[vertex_type])
-        feat_names = list(F.get_feature_list().keys())
-        base_series = None
-        for feat_name in feat_names:
-            if base_series is None:
-                base_series = F.get_data(v_ids, vertex_type, feat_name)
-            else:
-                base_series = np.stack(
-                    [base_series, F.get_data(v_ids, vertex_type, feat_name)]
-                )
-
-        tsr = cugraph_store.multi_get_tensor(
-            [
-                CuGraphTensorAttr(vertex_type, feat_name, v_ids)
-                for feat_name in feat_names
-            ]
-        )
-
-        assert torch.stack(tsr).tolist() == base_series.tolist()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_get_all_tensor_attrs(graph):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    tensor_attrs = []
-    for vertex_type in sorted(N.keys()):
-        for prop in ["prop1", "prop2"]:
-            tensor_attrs.append(
-                CuGraphTensorAttr(
-                    vertex_type,
-                    prop,
-                    properties=None,
-                    dtype=F.get_data([0], vertex_type, "prop1").dtype,
-                )
-            )
-
-    for t in tensor_attrs:
-        print(t)
-
-    print("\n\n")
-
-    for t in cugraph_store.get_all_tensor_attrs():
-        print(t)
-
-    assert sorted(tensor_attrs, key=lambda a: (a.group_name, a.attr_name)) == sorted(
-        cugraph_store.get_all_tensor_attrs(), key=lambda a: (a.group_name, a.attr_name)
-    )
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_get_tensor_from_tensor_attrs(graph):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    tensor_attrs = cugraph_store.get_all_tensor_attrs()
-    for tensor_attr in tensor_attrs:
-        v_ids = np.arange(N[tensor_attr.group_name])
-        data = F.get_data(v_ids, tensor_attr.group_name, tensor_attr.attr_name)
-
-        tensor_attr.index = v_ids
-        assert cugraph_store.get_tensor(tensor_attr).tolist() == data.tolist()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_get_tensor_size(graph):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    tensor_attrs = cugraph_store.get_all_tensor_attrs()
-    for tensor_attr in tensor_attrs:
-        sz = N[tensor_attr.group_name]
-
-        tensor_attr.index = np.arange(sz)
-        assert cugraph_store.get_tensor_size(tensor_attr) == torch.Size((sz,))
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(
-    isinstance(torch_geometric, MissingModule), reason="pyg not available"
-)
-@pytest.mark.sg
-def test_get_input_nodes(karate_gnn):
-    F, G, N = karate_gnn
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    input_node_info = torch_geometric.loader.utils.get_input_nodes(
-        (cugraph_store, cugraph_store), "type0"
-    )
-
-    # PyG 2.4
-    if len(input_node_info) == 2:
-        node_type, input_nodes = input_node_info
-    # PyG 2.5
-    elif len(input_node_info) == 3:
-        node_type, input_nodes, input_id = input_node_info
-    # Invalid
-    else:
-        raise ValueError("Invalid output from get_input_nodes")
-
-    assert node_type == "type0"
-    assert input_nodes.tolist() == torch.arange(17, dtype=torch.int32).tolist()
-
-
-@pytest.mark.sg
-def test_serialize(multi_edge_multi_vertex_no_graph_1):
-    import pickle
-
-    F, G, N = multi_edge_multi_vertex_no_graph_1
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    cugraph_store_copy = pickle.loads(pickle.dumps(cugraph_store))
-
-    for tensor_attr in cugraph_store.get_all_tensor_attrs():
-        sz = cugraph_store.get_tensor_size(tensor_attr)[0]
-        tensor_attr.index = np.arange(sz)
-        assert (
-            cugraph_store.get_tensor(tensor_attr).tolist()
-            == cugraph_store_copy.get_tensor(tensor_attr).tolist()
-        )
-
-    # Currently does not store edgelist properly for SG
-    """
-    for edge_attr in cugraph_store.get_all_edge_attrs():
-        assert cugraph_store.get_edge_index(edge_attr) \
-            == cugraph_store_copy.get_edge_index(edge_attr)
-    """
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py
deleted file mode 100644
index 65cb8984586..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py
+++ /dev/null
@@ -1,424 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import cugraph
-from cugraph_pyg.data.dask_graph_store import (
-    CuGraphTensorAttr,
-    CuGraphEdgeAttr,
-    EdgeLayout,
-)
-from cugraph_pyg.data import DaskGraphStore
-
-import cudf
-import dask_cudf
-import cupy
-import numpy as np
-
-from cugraph.utilities.utils import import_optional, MissingModule
-
-import pytest
-
-
-torch = import_optional("torch")
-torch_geometric = import_optional("torch_geometric")
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-def test_tensor_attr():
-    ta = CuGraphTensorAttr("group0", "property1")
-    assert not ta.is_fully_specified()
-    assert not ta.is_set("index")
-
-    ta.fully_specify()
-    assert ta.is_fully_specified()
-
-    other_ta = CuGraphTensorAttr(index=[1, 2, 3])
-    ta.update(other_ta)
-    assert ta.index == [1, 2, 3]
-
-    casted_ta1 = CuGraphTensorAttr.cast(ta)
-    assert casted_ta1 == ta
-
-    casted_ta2 = CuGraphTensorAttr.cast(index=[1, 2, 3])
-    assert casted_ta2.index == [1, 2, 3]
-    assert not casted_ta2.is_fully_specified()
-
-    casted_ta3 = CuGraphTensorAttr.cast(
-        "group2",
-        "property2",
-        [1, 2, 3],
-    )
-    assert casted_ta3.group_name == "group2"
-    assert casted_ta3.attr_name == "property2"
-    assert casted_ta3.index == [1, 2, 3]
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-def test_edge_attr():
-    ea = CuGraphEdgeAttr("type0", EdgeLayout.COO, False, 10)
-    assert ea.edge_type == "type0"
-    assert ea.layout == EdgeLayout.COO
-    assert not ea.is_sorted
-    assert ea.size == 10
-
-    ea = CuGraphEdgeAttr(edge_type="type1", layout="csr", is_sorted=True)
-    assert ea.size is None
-
-    ea = CuGraphEdgeAttr.cast("type0", EdgeLayout.COO, False, 10)
-    assert ea.edge_type == "type0"
-    assert ea.layout == EdgeLayout.COO
-    assert not ea.is_sorted
-    assert ea.size == 10
-
-
-@pytest.fixture(
-    params=[
-        "basic_graph_1",
-        "multi_edge_graph_1",
-        "multi_edge_multi_vertex_graph_1",
-    ]
-)
-def graph(request):
-    return request.getfixturevalue(request.param)
-
-
-@pytest.fixture(params=["basic_graph_1", "multi_edge_graph_1"])
-def single_vertex_graph(request):
-    return request.getfixturevalue(request.param)
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.parametrize(
-    "edge_index_type", ["numpy", "torch-cpu", "torch-gpu", "cudf", "dask-cudf"]
-)
-@pytest.mark.mg
-def test_get_edge_index(graph, edge_index_type, dask_client):
-    F, G, N = graph
-    if "torch" in edge_index_type:
-        if edge_index_type == "torch-cpu":
-            device = "cpu"
-        else:
-            device = "cuda"
-        for et in list(G.keys()):
-            G[et][0] = torch.as_tensor(G[et][0], device=device)
-            G[et][1] = torch.as_tensor(G[et][1], device=device)
-    elif edge_index_type == "cudf":
-        for et in list(G.keys()):
-            G[et][0] = cudf.Series(G[et][0])
-            G[et][1] = cudf.Series(G[et][1])
-    elif edge_index_type == "dask-cudf":
-        for et in list(G.keys()):
-            G[et][0] = dask_cudf.from_cudf(cudf.Series(G[et][0]), npartitions=1)
-            G[et][1] = dask_cudf.from_cudf(cudf.Series(G[et][1]), npartitions=1)
-
-    cugraph_store = DaskGraphStore(F, G, N, order="CSC", multi_gpu=True)
-
-    for pyg_can_edge_type in G:
-        src, dst = cugraph_store.get_edge_index(
-            edge_type=pyg_can_edge_type, layout="coo", is_sorted=False
-        )
-
-        if edge_index_type == "cudf":
-            assert G[pyg_can_edge_type][0].values_host.tolist() == src.tolist()
-            assert G[pyg_can_edge_type][1].values_host.tolist() == dst.tolist()
-        elif edge_index_type == "dask-cudf":
-            assert (
-                G[pyg_can_edge_type][0].compute().values_host.tolist() == src.tolist()
-            )
-            assert (
-                G[pyg_can_edge_type][1].compute().values_host.tolist() == dst.tolist()
-            )
-        else:
-            assert G[pyg_can_edge_type][0].tolist() == src.tolist()
-            assert G[pyg_can_edge_type][1].tolist() == dst.tolist()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_edge_types(graph, dask_client):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
-
-    eta = cugraph_store._edge_types_to_attrs
-    assert eta.keys() == G.keys()
-
-    for attr_name, attr_repr in eta.items():
-        src_size = N[attr_name[0]]
-        dst_size = N[attr_name[-1]]
-        assert src_size == attr_repr.size[0]
-        assert dst_size == attr_repr.size[-1]
-        assert attr_name == attr_repr.edge_type
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_get_subgraph(graph, dask_client):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
-
-    if len(G.keys()) > 1:
-        for edge_type in G.keys():
-            # Subgraphing is not implemented yet and should raise an error
-            with pytest.raises(ValueError):
-                sg = cugraph_store._subgraph([edge_type])
-
-    sg = cugraph_store._subgraph(list(G.keys()))
-    assert isinstance(sg, cugraph.MultiGraph)
-
-    num_edges = sum([len(v[0]) for v in G.values()])
-    assert sg.number_of_edges() == num_edges
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_renumber_vertices_basic(single_vertex_graph, dask_client):
-    F, G, N = single_vertex_graph
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
-
-    nodes_of_interest = torch.as_tensor(
-        cupy.random.randint(0, sum(N.values()), 3), device="cuda"
-    )
-
-    index = cugraph_store._get_vertex_groups_from_sample(nodes_of_interest)
-    assert index["vt1"].tolist() == nodes_of_interest.tolist()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_renumber_vertices_multi_edge_multi_vertex(
-    multi_edge_multi_vertex_graph_1, dask_client
-):
-    F, G, N = multi_edge_multi_vertex_graph_1
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
-
-    nodes_of_interest = torch.as_tensor(
-        cupy.random.randint(0, sum(N.values()), 3), device="cuda"
-    ).unique()
-
-    index = cugraph_store._get_vertex_groups_from_sample(nodes_of_interest)
-
-    black_nodes = nodes_of_interest[nodes_of_interest <= 1]
-    brown_nodes = nodes_of_interest[nodes_of_interest > 1] - 2
-
-    if len(black_nodes) > 0:
-        assert index["black"].tolist() == sorted(black_nodes.tolist())
-    if len(brown_nodes) > 0:
-        assert index["brown"].tolist() == sorted(brown_nodes.tolist())
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_renumber_edges(abc_graph, dask_client):
-    F, G, N = abc_graph
-
-    graph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR")
-
-    # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3]
-    mock_sampling_results = cudf.DataFrame(
-        {
-            "majors": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"),
-            "minors": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"),
-            "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"),
-            "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"),
-        }
-    )
-
-    mock_noi_index = {
-        "A": torch.tensor([0, 1], device="cuda"),
-        "B": torch.tensor([0, 1], device="cuda"),
-        "C": torch.tensor([3, 2, 0], device="cuda"),
-    }
-
-    row_dict, col_dict = graph_store._get_renumbered_edge_groups_from_sample(
-        mock_sampling_results, mock_noi_index
-    )
-
-    assert len(row_dict) == 3
-    assert len(col_dict) == 3
-    assert row_dict[("A", "ab", "B")].tolist() == [0, 0, 1, 1]
-    assert col_dict[("A", "ab", "B")].tolist() == [0, 1, 1, 1]
-    assert row_dict[("B", "bc", "C")].tolist() == [0, 1, 1, 1]
-    assert col_dict[("B", "bc", "C")].tolist() == [0, 1, 2, 1]
-    assert row_dict[("B", "ba", "A")].tolist() == [1, 1]
-    assert col_dict[("B", "ba", "A")].tolist() == [1, 1]
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_get_tensor(graph, dask_client):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
-
-    for feature_name, feature_on_types in F.get_feature_list().items():
-        for type_name in feature_on_types:
-            v_ids = np.arange(N[type_name])
-            base_series = F.get_data(
-                v_ids,
-                type_name=type_name,
-                feat_name=feature_name,
-            ).tolist()
-
-            tsr = cugraph_store.get_tensor(
-                type_name, feature_name, v_ids, None, cupy.int64
-            ).tolist()
-
-            assert tsr == base_series
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_get_tensor_empty_idx(karate_gnn, dask_client):
-    F, G, N = karate_gnn
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
-
-    t = cugraph_store.get_tensor(
-        CuGraphTensorAttr(group_name="type0", attr_name="prop0", index=None)
-    )
-    assert t.tolist() == (torch.arange(17, dtype=torch.float32) * 31).tolist()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_multi_get_tensor(graph, dask_client):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
-
-    for vertex_type in sorted(N.keys()):
-        v_ids = np.arange(N[vertex_type])
-        feat_names = list(F.get_feature_list().keys())
-        base_series = None
-        for feat_name in feat_names:
-            if base_series is None:
-                base_series = F.get_data(v_ids, vertex_type, feat_name)
-            else:
-                base_series = np.stack(
-                    [base_series, F.get_data(v_ids, vertex_type, feat_name)]
-                )
-
-        tsr = cugraph_store.multi_get_tensor(
-            [
-                CuGraphTensorAttr(vertex_type, feat_name, v_ids)
-                for feat_name in feat_names
-            ]
-        )
-
-        assert torch.stack(tsr).tolist() == base_series.tolist()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_get_all_tensor_attrs(graph, dask_client):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
-
-    tensor_attrs = []
-    for vertex_type in sorted(N.keys()):
-        for prop in ["prop1", "prop2"]:
-            tensor_attrs.append(
-                CuGraphTensorAttr(
-                    vertex_type,
-                    prop,
-                    properties=None,
-                    dtype=F.get_data([0], vertex_type, "prop1").dtype,
-                )
-            )
-
-    assert sorted(tensor_attrs, key=lambda a: (a.group_name, a.attr_name)) == sorted(
-        cugraph_store.get_all_tensor_attrs(), key=lambda a: (a.group_name, a.attr_name)
-    )
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_get_tensor_from_tensor_attrs(graph, dask_client):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
-
-    tensor_attrs = cugraph_store.get_all_tensor_attrs()
-    for tensor_attr in tensor_attrs:
-        v_ids = np.arange(N[tensor_attr.group_name])
-        data = F.get_data(v_ids, tensor_attr.group_name, tensor_attr.attr_name)
-
-        tensor_attr.index = v_ids
-        assert cugraph_store.get_tensor(tensor_attr).tolist() == data.tolist()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_get_tensor_size(graph, dask_client):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
-
-    tensor_attrs = cugraph_store.get_all_tensor_attrs()
-    for tensor_attr in tensor_attrs:
-        sz = N[tensor_attr.group_name]
-
-        tensor_attr.index = np.arange(sz)
-        assert cugraph_store.get_tensor_size(tensor_attr) == torch.Size((sz,))
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(
-    isinstance(torch_geometric, MissingModule), reason="pyg not available"
-)
-@pytest.mark.mg
-def test_get_input_nodes(karate_gnn, dask_client):
-    F, G, N = karate_gnn
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
-
-    nodes = torch_geometric.loader.utils.get_input_nodes(
-        (cugraph_store, cugraph_store), "type0"
-    )
-
-    if len(nodes) == 2:
-        node_type, input_nodes = nodes
-    else:
-        node_type, input_nodes, _ = nodes
-
-    assert node_type == "type0"
-    assert input_nodes.tolist() == torch.arange(17, dtype=torch.int32).tolist()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_mg_frame_handle(graph, dask_client):
-    F, G, N = graph
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True)
-    assert isinstance(cugraph_store._DaskGraphStore__graph._plc_graph, dict)
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_cugraph_loader_large_index(dask_client):
-    large_index = (
-        np.random.randint(0, 1_000_000, (100_000_000,)),
-        np.random.randint(0, 1_000_000, (100_000_000,)),
-    )
-
-    large_features = np.random.randint(0, 50, (1_000_000,))
-    F = cugraph.gnn.FeatureStore(backend="torch")
-    F.add_data(large_features, "N", "f")
-
-    store = DaskGraphStore(
-        F,
-        {("N", "e", "N"): large_index},
-        {"N": 1_000_000},
-        multi_gpu=True,
-    )
-
-    graph = store._subgraph()
-    assert isinstance(graph, cugraph.Graph)
-
-    el = graph.view_edge_list().compute()
-    assert (el["src"].values_host - large_index[0]).sum() == 0
-    assert (el["dst"].values_host - large_index[1]).sum() == 0
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py
deleted file mode 100644
index ab5f1e217bb..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph.utilities.utils import import_optional, MissingModule
-
-from cugraph_pyg.data import TensorDictFeatureStore
-
-torch = import_optional("torch")
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_tensordict_feature_store_basic_api():
-    feature_store = TensorDictFeatureStore()
-
-    node_features_0 = torch.randint(128, (100, 1000))
-    node_features_1 = torch.randint(256, (100, 10))
-
-    other_features = torch.randint(1024, (10, 5))
-
-    feature_store["node", "feat0"] = node_features_0
-    feature_store["node", "feat1"] = node_features_1
-    feature_store["other", "feat"] = other_features
-
-    assert (feature_store["node"]["feat0"][:] == node_features_0).all()
-    assert (feature_store["node"]["feat1"][:] == node_features_1).all()
-    assert (feature_store["other"]["feat"][:] == other_features).all()
-
-    assert len(feature_store.get_all_tensor_attrs()) == 3
-
-    del feature_store["node", "feat0"]
-    assert len(feature_store.get_all_tensor_attrs()) == 2
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store_mg.py
deleted file mode 100644
index f1f514560c8..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store_mg.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-import pytest
-
-from cugraph.utilities.utils import import_optional, MissingModule
-
-from cugraph_pyg.data import TensorDictFeatureStore, WholeFeatureStore
-
-torch = import_optional("torch")
-pylibwholegraph = import_optional("pylibwholegraph")
-
-
-def run_test_wholegraph_feature_store_basic_api(rank, world_size, dtype):
-    if dtype == "float32":
-        torch_dtype = torch.float32
-    elif dtype == "int64":
-        torch_dtype = torch.int64
-
-    torch.cuda.set_device(rank)
-
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "12355"
-    torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size)
-
-    pylibwholegraph.torch.initialize.init(
-        rank,
-        world_size,
-        rank,
-        world_size,
-    )
-
-    features = torch.arange(0, world_size * 2000)
-    features = features.reshape((features.numel() // 100, 100)).to(torch_dtype)
-
-    tensordict_store = TensorDictFeatureStore()
-    tensordict_store["node", "fea"] = features
-
-    whole_store = WholeFeatureStore()
-    whole_store["node", "fea"] = torch.tensor_split(features, world_size)[rank]
-
-    ix = torch.arange(features.shape[0])
-    assert (
-        whole_store["node", "fea"][ix].cpu() == tensordict_store["node", "fea"][ix]
-    ).all()
-
-    label = torch.arange(0, features.shape[0]).reshape((features.shape[0], 1))
-    tensordict_store["node", "label"] = label
-    whole_store["node", "label"] = torch.tensor_split(label, world_size)[rank]
-
-    assert (
-        whole_store["node", "fea"][ix].cpu() == tensordict_store["node", "fea"][ix]
-    ).all()
-
-    pylibwholegraph.torch.initialize.finalize()
-
-
-@pytest.mark.skipif(
-    isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available"
-)
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.parametrize("dtype", ["float32", "int64"])
-@pytest.mark.mg
-def test_wholegraph_feature_store_basic_api(dtype):
-    world_size = torch.cuda.device_count()
-    torch.multiprocessing.spawn(
-        run_test_wholegraph_feature_store_basic_api,
-        args=(
-            world_size,
-            dtype,
-        ),
-        nprocs=world_size,
-    )
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py
deleted file mode 100644
index a8b93665aad..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph.datasets import karate
-from cugraph.utilities.utils import import_optional, MissingModule
-
-from cugraph_pyg.data import GraphStore
-
-torch = import_optional("torch")
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_graph_store_basic_api():
-    df = karate.get_edgelist()
-    src = torch.as_tensor(df["src"], device="cuda")
-    dst = torch.as_tensor(df["dst"], device="cuda")
-
-    ei = torch.stack([dst, src])
-
-    graph_store = GraphStore()
-    graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo")
-
-    rei = graph_store.get_edge_index(("person", "knows", "person"), "coo")
-
-    assert (ei == rei).all()
-
-    edge_attrs = graph_store.get_all_edge_attrs()
-    assert len(edge_attrs) == 1
-
-    graph_store.remove_edge_index(("person", "knows", "person"), "coo")
-    edge_attrs = graph_store.get_all_edge_attrs()
-    assert len(edge_attrs) == 0
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py
deleted file mode 100644
index 14540b7e17d..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph.datasets import karate
-from cugraph.utilities.utils import import_optional, MissingModule
-
-from cugraph_pyg.data import GraphStore
-
-torch = import_optional("torch")
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_graph_store_basic_api_mg():
-    df = karate.get_edgelist()
-    src = torch.as_tensor(df["src"], device="cuda")
-    dst = torch.as_tensor(df["dst"], device="cuda")
-
-    ei = torch.stack([dst, src])
-
-    graph_store = GraphStore(is_multi_gpu=True)
-    graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo")
-
-    rei = graph_store.get_edge_index(("person", "knows", "person"), "coo")
-
-    assert (ei == rei).all()
-
-    edge_attrs = graph_store.get_all_edge_attrs()
-    assert len(edge_attrs) == 1
-
-    graph_store.remove_edge_index(("person", "knows", "person"), "coo")
-    edge_attrs = graph_store.get_all_edge_attrs()
-    assert len(edge_attrs) == 0
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py
deleted file mode 100644
index 34ef6a59511..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py
+++ /dev/null
@@ -1,543 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-import tempfile
-import os
-
-import cudf
-import cupy
-import numpy as np
-
-from cugraph_pyg.loader import DaskNeighborLoader
-from cugraph_pyg.loader import BulkSampleLoader
-from cugraph_pyg.data import DaskGraphStore
-from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv
-
-from cugraph.gnn import FeatureStore
-from cugraph.utilities.utils import import_optional, MissingModule
-
-from typing import Dict, Tuple
-
-torch = import_optional("torch")
-torch_geometric = import_optional("torch_geometric")
-
-trim_to_layer = import_optional("torch_geometric.utils.trim_to_layer")
-if isinstance(trim_to_layer, MissingModule):
-    trim_to_layer = import_optional("torch_geometric.utils._trim_to_layer")
-
-
-try:
-    import torch_sparse  # noqa: F401
-
-    HAS_TORCH_SPARSE = True
-except:  # noqa: E722
-    HAS_TORCH_SPARSE = False
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_cugraph_loader_basic(
-    karate_gnn: Tuple[
-        FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int]
-    ]
-):
-    F, G, N = karate_gnn
-    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
-    loader = DaskNeighborLoader(
-        (cugraph_store, cugraph_store),
-        torch.arange(N["type0"] + N["type1"], dtype=torch.int64),
-        10,
-        num_neighbors=[4, 4],
-        random_state=62,
-        replace=False,
-    )
-
-    samples = [s for s in loader]
-
-    assert len(samples) == 3
-    for sample in samples:
-        if "type0" in sample:
-            for prop in sample["type0"]["prop0"].tolist():
-                assert prop % 31 == 0
-
-        if "type1" in sample:
-            for prop in sample["type1"]["prop0"].tolist():
-                assert prop % 41 == 0
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_cugraph_loader_hetero(
-    karate_gnn: Tuple[
-        FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int]
-    ]
-):
-    F, G, N = karate_gnn
-    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
-    loader = DaskNeighborLoader(
-        (cugraph_store, cugraph_store),
-        input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")),
-        batch_size=2,
-        num_neighbors=[4, 4],
-        random_state=62,
-        replace=False,
-    )
-
-    samples = [s for s in loader]
-
-    assert len(samples) == 2
-    for sample in samples:
-        if "type0" in sample:
-            for prop in sample["type0"]["prop0"].tolist():
-                assert prop % 31 == 0
-
-        if "type1" in sample:
-            for prop in sample["type1"]["prop0"].tolist():
-                assert prop % 41 == 0
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_cugraph_loader_from_disk():
-    m = [2, 9, 99, 82, 9, 3, 18, 1, 12]
-    n = torch.arange(1, 1 + len(m), dtype=torch.int32)
-    x = torch.zeros(256, dtype=torch.int32)
-    x[torch.tensor(m, dtype=torch.int32)] = n
-    F = FeatureStore()
-    F.add_data(x, "t0", "x")
-
-    G = {("t0", "knows", "t0"): 9080}
-    N = {"t0": 256}
-
-    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
-
-    bogus_samples = cudf.DataFrame(
-        {
-            "majors": [0, 1, 2, 3, 4, 5, 6, 6],
-            "minors": [5, 4, 3, 2, 2, 6, 5, 2],
-            "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"),
-            "edge_id": [5, 10, 15, 20, 25, 30, 35, 40],
-            "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"),
-        }
-    )
-    map = cudf.Series(m, name="map")
-    bogus_samples = bogus_samples.join(map, how="outer").sort_index()
-
-    tempdir = tempfile.TemporaryDirectory()
-    for s in range(256):
-        bogus_samples["batch_id"] = cupy.int32(s)
-        bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet"))
-
-    loader = BulkSampleLoader(
-        feature_store=cugraph_store,
-        graph_store=cugraph_store,
-        directory=tempdir,
-    )
-
-    num_samples = 0
-    for sample in loader:
-        num_samples += 1
-        assert sample["t0"]["num_nodes"] == 7
-        # correct vertex order is [0, 1, 2, 5, 4, 3, 6]; x = [1, 2, 3, 6, 5, 4, 7]
-        assert sample["t0"]["x"].tolist() == [3, 4, 5, 6, 7, 8, 9]
-
-        edge_index = sample[("t0", "knows", "t0")]["edge_index"]
-        assert list(edge_index.shape) == [2, 8]
-
-        assert (
-            edge_index[0].tolist() == bogus_samples.majors.dropna().values_host.tolist()
-        )
-        assert (
-            edge_index[1].tolist() == bogus_samples.minors.dropna().values_host.tolist()
-        )
-
-    assert num_samples == 256
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_cugraph_loader_from_disk_subset():
-    m = [2, 9, 99, 82, 9, 3, 18, 1, 12]
-    n = torch.arange(1, 1 + len(m), dtype=torch.int32)
-    x = torch.zeros(256, dtype=torch.int32)
-    x[torch.tensor(m, dtype=torch.int32)] = n
-    F = FeatureStore()
-    F.add_data(x, "t0", "x")
-
-    G = {("t0", "knows", "t0"): 9080}
-    N = {"t0": 256}
-
-    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
-
-    bogus_samples = cudf.DataFrame(
-        {
-            "majors": [0, 1, 2, 3, 4, 5, 6, 6],
-            "minors": [5, 4, 3, 2, 2, 6, 5, 2],
-            "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"),
-            "edge_id": [5, 10, 15, 20, 25, 30, 35, 40],
-            "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"),
-        }
-    )
-    map = cudf.Series(m, name="map")
-    bogus_samples = bogus_samples.join(map, how="outer").sort_index()
-
-    tempdir = tempfile.TemporaryDirectory()
-    for s in range(256):
-        bogus_samples["batch_id"] = cupy.int32(s)
-        bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet"))
-
-    loader = BulkSampleLoader(
-        feature_store=cugraph_store,
-        graph_store=cugraph_store,
-        directory=tempdir,
-        input_files=list(os.listdir(tempdir.name))[100:200],
-    )
-
-    num_samples = 0
-    for sample in loader:
-        num_samples += 1
-        assert sample["t0"]["num_nodes"] == 7
-        # correct vertex order is [0, 1, 2, 6, 4, 3, 5]; x = [1, 2, 3, 7, 5, 4, 6]
-        assert sample["t0"]["x"].tolist() == [3, 4, 5, 6, 7, 8, 9]
-
-        edge_index = sample[("t0", "knows", "t0")]["edge_index"]
-        assert list(edge_index.shape) == [2, 8]
-
-        assert (
-            edge_index[0].tolist() == bogus_samples.majors.dropna().values_host.tolist()
-        )
-        assert (
-            edge_index[1].tolist() == bogus_samples.minors.dropna().values_host.tolist()
-        )
-
-    assert num_samples == 100
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(not HAS_TORCH_SPARSE, reason="torch-sparse not available")
-@pytest.mark.sg
-def test_cugraph_loader_from_disk_subset_csr():
-    m = [2, 9, 99, 82, 11, 13]
-    n = torch.arange(1, 1 + len(m), dtype=torch.int32)
-    x = torch.zeros(256, dtype=torch.int32)
-    x[torch.tensor(m, dtype=torch.int32)] = n
-    F = FeatureStore()
-    F.add_data(x, "t0", "x")
-
-    G = {("t0", "knows", "t0"): 9080}
-    N = {"t0": 256}
-
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    bogus_samples = cudf.DataFrame(
-        {
-            "major_offsets": [0, 3, 5, 7, 8, None, None, None],
-            "minors": [1, 2, 3, 0, 3, 4, 5, 1],
-            "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"),
-            "edge_id": [5, 10, 15, 20, 25, 30, 35, 40],
-            "label_hop_offsets": cudf.Series(
-                [0, 1, 4, None, None, None, None, None], dtype="int32"
-            ),
-            "renumber_map_offsets": cudf.Series([0, 6], dtype="int32"),
-        }
-    )
-    map = cudf.Series(m, name="map")
-    bogus_samples["map"] = map
-
-    tempdir = tempfile.TemporaryDirectory()
-    for s in range(256):
-        # offset the offsets
-        bogus_samples["batch_id"] = cupy.int32(s)
-        bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet"))
-
-    loader = BulkSampleLoader(
-        feature_store=cugraph_store,
-        graph_store=cugraph_store,
-        directory=tempdir,
-        input_files=list(os.listdir(tempdir.name))[100:200],
-    )
-
-    num_samples = 0
-    for sample in loader:
-        num_samples += 1
-        assert sample["t0"]["num_nodes"] == 6
-
-        assert sample["t0"]["x"].tolist() == [1, 2, 3, 4, 5, 6]
-
-        edge_index = sample[("t0", "knows", "t0")]["adj_t"]
-        assert edge_index.size(0) == 4
-        assert edge_index.size(1) == 6
-
-        colptr, row, _ = edge_index.csr()
-
-        assert (
-            colptr.tolist() == bogus_samples.major_offsets.dropna().values_host.tolist()
-        )
-        assert row.tolist() == bogus_samples.minors.dropna().values_host.tolist()
-
-        assert sample["t0"]["num_sampled_nodes"] == [1, 3, 2]
-        assert sample["t0", "knows", "t0"]["num_sampled_edges"] == [3, 5]
-
-    assert num_samples == 100
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_cugraph_loader_e2e_coo():
-    m = [2, 9, 99, 82, 9, 3, 18, 1, 12]
-    x = torch.randint(3000, (256, 256)).to(torch.float32)
-    F = FeatureStore()
-    F.add_data(x, "t0", "x")
-
-    G = {("t0", "knows", "t0"): 9999}
-    N = {"t0": 256}
-
-    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
-
-    bogus_samples = cudf.DataFrame(
-        {
-            "majors": [0, 1, 2, 3, 4, 5, 6, 6],
-            "minors": [5, 4, 3, 2, 2, 6, 5, 2],
-            "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"),
-            "edge_id": [5, 10, 15, 20, 25, 30, 35, 40],
-            "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"),
-        }
-    )
-    map = cudf.Series(m, name="map")
-    bogus_samples = bogus_samples.join(map, how="outer").sort_index()
-
-    tempdir = tempfile.TemporaryDirectory()
-    for s in range(256):
-        bogus_samples["batch_id"] = cupy.int32(s)
-        bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet"))
-
-    loader = BulkSampleLoader(
-        feature_store=cugraph_store,
-        graph_store=cugraph_store,
-        directory=tempdir,
-        input_files=list(os.listdir(tempdir.name))[100:200],
-    )
-
-    convs = [
-        torch_geometric.nn.SAGEConv(256, 64, aggr="mean").cuda(),
-        torch_geometric.nn.SAGEConv(64, 8, aggr="mean").cuda(),
-        torch_geometric.nn.SAGEConv(8, 1, aggr="mean").cuda(),
-    ]
-
-    trim = trim_to_layer.TrimToLayer()
-    relu = torch.nn.functional.relu
-    dropout = torch.nn.functional.dropout
-
-    for hetero_data in loader:
-        ei = hetero_data["t0", "knows", "t0"]["edge_index"]
-        x = hetero_data["t0"]["x"].cuda()
-        num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"]
-        num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"]
-
-        for i in range(len(convs)):
-            x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None)
-
-            s = x.shape[0]
-
-            x = convs[i](x, ei, size=(s, s))
-            x = relu(x)
-            x = dropout(x, p=0.5)
-
-        x = x.narrow(dim=0, start=0, length=x.shape[0] - num_sampled_nodes[1])
-
-        assert list(x.shape) == [3, 1]
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skipif(not HAS_TORCH_SPARSE, reason="torch-sparse not available")
-@pytest.mark.parametrize("framework", ["pyg", "cugraph-ops"])
-@pytest.mark.sg
-def test_cugraph_loader_e2e_csc(framework: str):
-    m = [2, 9, 99, 82, 9, 3, 18, 1, 12]
-    x = torch.randint(3000, (256, 256)).to(torch.float32)
-    F = FeatureStore()
-    F.add_data(x, "t0", "x")
-
-    G = {("t0", "knows", "t0"): 9999}
-    N = {"t0": 256}
-
-    cugraph_store = DaskGraphStore(F, G, N)
-
-    bogus_samples = cudf.DataFrame(
-        {
-            "major_offsets": [0, 3, 5, 7, 8, None, None, None],
-            "minors": [1, 2, 3, 0, 3, 4, 5, 1],
-            "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"),
-            "edge_id": [5, 10, 15, 20, 25, 30, 35, 40],
-            "label_hop_offsets": cudf.Series(
-                [0, 1, 4, None, None, None, None, None], dtype="int32"
-            ),
-            "renumber_map_offsets": cudf.Series([0, 6], dtype="int32"),
-        }
-    )
-    map = cudf.Series(m, name="map")
-    bogus_samples = bogus_samples.join(map, how="outer").sort_index()
-
-    tempdir = tempfile.TemporaryDirectory()
-    for s in range(256):
-        bogus_samples["batch_id"] = cupy.int32(s)
-        bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet"))
-
-    loader = BulkSampleLoader(
-        feature_store=cugraph_store,
-        graph_store=cugraph_store,
-        directory=tempdir,
-        input_files=list(os.listdir(tempdir.name))[100:200],
-    )
-
-    if framework == "pyg":
-        convs = [
-            torch_geometric.nn.SAGEConv(256, 64, aggr="mean").cuda(),
-            torch_geometric.nn.SAGEConv(64, 1, aggr="mean").cuda(),
-        ]
-    else:
-        convs = [
-            CuGraphSAGEConv(256, 64, aggr="mean").cuda(),
-            CuGraphSAGEConv(64, 1, aggr="mean").cuda(),
-        ]
-
-    trim = trim_to_layer.TrimToLayer()
-    relu = torch.nn.functional.relu
-    dropout = torch.nn.functional.dropout
-
-    for hetero_data in loader:
-        x = hetero_data["t0"]["x"].cuda()
-
-        if framework == "pyg":
-            ei = hetero_data["t0", "knows", "t0"]["adj_t"].coo()
-            ei = torch.stack((ei[0], ei[1]))
-        else:
-            ei = hetero_data["t0", "knows", "t0"]["adj_t"].csr()
-            ei = [ei[1], ei[0], x.shape[0]]
-
-        num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"]
-        num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"]
-
-        s = x.shape[0]
-        for i in range(len(convs)):
-            if framework == "pyg":
-                x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None)
-            else:
-                if i > 0:
-                    x = x.narrow(
-                        dim=0,
-                        start=0,
-                        length=s - num_sampled_nodes[-i],
-                    )
-
-                    ei[0] = ei[0].narrow(
-                        dim=0,
-                        start=0,
-                        length=ei[0].size(0) - num_sampled_edges[-i],
-                    )
-                    ei[1] = ei[1].narrow(
-                        dim=0, start=0, length=ei[1].size(0) - num_sampled_nodes[-i]
-                    )
-                    ei[2] = x.size(0)
-
-            s = x.shape[0]
-
-            if framework == "pyg":
-                x = convs[i](x, ei, size=(s, s))
-            else:
-                x = convs[i](x, ei)
-            x = relu(x)
-            x = dropout(x, p=0.5)
-
-        x = x.narrow(dim=0, start=0, length=s - num_sampled_nodes[1])
-
-        assert list(x.shape) == [1, 1]
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.parametrize("drop_last", [True, False])
-@pytest.mark.sg
-def test_drop_last(drop_last):
-    N = {"N": 10}
-    G = {
-        ("N", "e", "N"): torch.stack(
-            [torch.tensor([0, 1, 2, 3, 4]), torch.tensor([5, 6, 7, 8, 9])]
-        )
-    }
-    F = FeatureStore(backend="torch")
-    F.add_data(torch.arange(10), "N", "z")
-
-    store = DaskGraphStore(F, G, N)
-    with tempfile.TemporaryDirectory() as dir:
-        loader = DaskNeighborLoader(
-            (store, store),
-            input_nodes=torch.tensor([0, 1, 2, 3, 4]),
-            num_neighbors=[1],
-            batch_size=2,
-            shuffle=False,
-            drop_last=drop_last,
-            batches_per_partition=1,
-            directory=dir,
-        )
-
-        t = torch.tensor([])
-        for batch in loader:
-            t = torch.concat([t, batch["N"].z])
-
-        t = t.tolist()
-
-        files = os.listdir(dir)
-        assert len(files) == 2 if drop_last else 3
-        assert "batch=0-0.parquet" in files
-        assert "batch=1-1.parquet" in files
-        if not drop_last:
-            assert "batch=2-2.parquet" in files
-
-
-@pytest.mark.parametrize("directory", ["local", "temp"])
-@pytest.mark.sg
-def test_load_directory(
-    karate_gnn: Tuple[
-        FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int]
-    ],
-    directory: str,
-):
-    if directory == "local":
-        local_dir = tempfile.TemporaryDirectory(dir=".")
-
-    cugraph_store = DaskGraphStore(*karate_gnn)
-    cugraph_loader = DaskNeighborLoader(
-        (cugraph_store, cugraph_store),
-        torch.arange(8, dtype=torch.int64),
-        2,
-        num_neighbors=[8, 4, 2],
-        random_state=62,
-        replace=False,
-        directory=None if directory == "temp" else local_dir.name,
-        batches_per_partition=1,
-    )
-
-    it = iter(cugraph_loader)
-    next_batch = next(it)
-    assert next_batch is not None
-
-    if directory == "local":
-        assert len(os.listdir(local_dir.name)) == 4
-
-    count = 1
-    while next(it, None) is not None:
-        count += 1
-
-    assert count == 4
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py
deleted file mode 100644
index 9e8a85a5b67..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_pyg.loader import DaskNeighborLoader
-from cugraph_pyg.data import DaskGraphStore
-from cugraph.utilities.utils import import_optional, MissingModule
-
-torch = import_optional("torch")
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_cugraph_loader_basic(dask_client, karate_gnn):
-    F, G, N = karate_gnn
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR")
-    loader = DaskNeighborLoader(
-        (cugraph_store, cugraph_store),
-        torch.arange(N["type0"] + N["type1"], dtype=torch.int64),
-        10,
-        num_neighbors=[4, 4],
-        random_state=62,
-        replace=False,
-    )
-
-    assert isinstance(cugraph_store._subgraph()._plc_graph, dict)
-
-    samples = [s for s in loader]
-
-    assert len(samples) == 3
-    for sample in samples:
-        if "type0" in sample:
-            for prop in sample["type0"]["prop0"].tolist():
-                assert prop % 31 == 0
-
-        if "type1" in sample:
-            for prop in sample["type1"]["prop0"].tolist():
-                assert prop % 41 == 0
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_cugraph_loader_hetero(dask_client, karate_gnn):
-    F, G, N = karate_gnn
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR")
-    loader = DaskNeighborLoader(
-        (cugraph_store, cugraph_store),
-        input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")),
-        batch_size=2,
-        num_neighbors=[4, 4],
-        random_state=62,
-        replace=False,
-    )
-
-    samples = [s for s in loader]
-
-    assert len(samples) == 2
-    for sample in samples:
-        print(sample)
-        if "type0" in sample:
-            for prop in sample["type0"]["prop0"].tolist():
-                assert prop % 31 == 0
-
-        if "type1" in sample:
-            for prop in sample["type1"]["prop0"].tolist():
-                assert prop % 41 == 0
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py
deleted file mode 100644
index 8ee18a826f7..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph.datasets import karate
-from cugraph.utilities.utils import import_optional, MissingModule
-
-import cugraph_pyg
-from cugraph_pyg.data import TensorDictFeatureStore, GraphStore
-from cugraph_pyg.loader import NeighborLoader
-
-torch = import_optional("torch")
-torch_geometric = import_optional("torch_geometric")
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_neighbor_loader():
-    """
-    Basic e2e test that covers loading and sampling.
-    """
-
-    df = karate.get_edgelist()
-    src = torch.as_tensor(df["src"], device="cuda")
-    dst = torch.as_tensor(df["dst"], device="cuda")
-
-    ei = torch.stack([dst, src])
-
-    graph_store = GraphStore()
-    graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo")
-
-    feature_store = TensorDictFeatureStore()
-    feature_store["person", "feat"] = torch.randint(128, (34, 16))
-
-    loader = NeighborLoader(
-        (feature_store, graph_store),
-        [5, 5],
-        input_nodes=torch.arange(34),
-    )
-
-    for batch in loader:
-        assert isinstance(batch, torch_geometric.data.Data)
-        assert (feature_store["person", "feat"][batch.n_id] == batch.feat).all()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_neighbor_loader_biased():
-    eix = torch.tensor(
-        [
-            [3, 4, 5],
-            [0, 1, 2],
-        ]
-    )
-
-    graph_store = GraphStore()
-    graph_store.put_edge_index(eix, ("person", "knows", "person"), "coo")
-
-    feature_store = TensorDictFeatureStore()
-    feature_store["person", "feat"] = torch.randint(128, (6, 12))
-    feature_store[("person", "knows", "person"), "bias"] = torch.tensor(
-        [0, 12, 14], dtype=torch.float32
-    )
-
-    loader = NeighborLoader(
-        (feature_store, graph_store),
-        [1],
-        input_nodes=torch.tensor([0, 1, 2], dtype=torch.int64),
-        batch_size=3,
-        weight_attr="bias",
-    )
-
-    out = list(iter(loader))
-    assert len(out) == 1
-    out = out[0]
-
-    assert out.edge_index.shape[1] == 2
-    assert (out.edge_index.cpu() == torch.tensor([[3, 4], [1, 2]])).all()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-@pytest.mark.parametrize("num_nodes", [10, 25])
-@pytest.mark.parametrize("num_edges", [64, 128])
-@pytest.mark.parametrize("batch_size", [2, 4])
-@pytest.mark.parametrize("select_edges", [16, 32])
-@pytest.mark.parametrize("depth", [1, 3])
-@pytest.mark.parametrize("num_neighbors", [1, 4])
-def test_link_neighbor_loader_basic(
-    num_nodes, num_edges, batch_size, select_edges, num_neighbors, depth
-):
-    graph_store = GraphStore()
-    feature_store = TensorDictFeatureStore()
-
-    eix = torch.randperm(num_edges)[:select_edges]
-    graph_store[("n", "e", "n"), "coo"] = torch.stack(
-        [
-            torch.randint(0, num_nodes, (num_edges,)),
-            torch.randint(0, num_nodes, (num_edges,)),
-        ]
-    )
-
-    elx = graph_store[("n", "e", "n"), "coo"][:, eix]
-    loader = cugraph_pyg.loader.LinkNeighborLoader(
-        (feature_store, graph_store),
-        num_neighbors=[num_neighbors] * depth,
-        edge_label_index=elx,
-        batch_size=batch_size,
-        shuffle=False,
-    )
-
-    elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1)
-    for i, batch in enumerate(loader):
-        assert (
-            batch.input_id.cpu() == torch.arange(i * batch_size, (i + 1) * batch_size)
-        ).all()
-        assert (elx[i] == batch.n_id[batch.edge_label_index.cpu()]).all()
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-@pytest.mark.parametrize("batch_size", [1, 2])
-def test_link_neighbor_loader_negative_sampling_basic(batch_size):
-    num_edges = 62
-    num_nodes = 19
-    select_edges = 17
-
-    graph_store = GraphStore()
-    feature_store = TensorDictFeatureStore()
-
-    eix = torch.randperm(num_edges)[:select_edges]
-    graph_store[("n", "e", "n"), "coo"] = torch.stack(
-        [
-            torch.randint(0, num_nodes, (num_edges,)),
-            torch.randint(0, num_nodes, (num_edges,)),
-        ]
-    )
-
-    elx = graph_store[("n", "e", "n"), "coo"][:, eix]
-    loader = cugraph_pyg.loader.LinkNeighborLoader(
-        (feature_store, graph_store),
-        num_neighbors=[3, 3, 3],
-        edge_label_index=elx,
-        batch_size=batch_size,
-        neg_sampling="binary",
-        shuffle=False,
-    )
-
-    elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1)
-    for i, batch in enumerate(loader):
-        assert batch.edge_label[0] == 1.0
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-@pytest.mark.parametrize("batch_size", [1, 2])
-def test_link_neighbor_loader_negative_sampling_uneven(batch_size):
-    num_edges = 62
-    num_nodes = 19
-    select_edges = 17
-
-    graph_store = GraphStore()
-    feature_store = TensorDictFeatureStore()
-
-    eix = torch.randperm(num_edges)[:select_edges]
-    graph_store[("n", "e", "n"), "coo"] = torch.stack(
-        [
-            torch.randint(0, num_nodes, (num_edges,)),
-            torch.randint(0, num_nodes, (num_edges,)),
-        ]
-    )
-
-    elx = graph_store[("n", "e", "n"), "coo"][:, eix]
-    loader = cugraph_pyg.loader.LinkNeighborLoader(
-        (feature_store, graph_store),
-        num_neighbors=[3, 3, 3],
-        edge_label_index=elx,
-        batch_size=batch_size,
-        neg_sampling=torch_geometric.sampler.NegativeSampling("binary", amount=0.1),
-        shuffle=False,
-    )
-
-    elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1)
-    for i, batch in enumerate(loader):
-        assert batch.edge_label[0] == 1.0
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py
deleted file mode 100644
index d1dee01a508..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py
+++ /dev/null
@@ -1,364 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-import os
-
-from cugraph.datasets import karate
-from cugraph.utilities.utils import import_optional, MissingModule
-
-from cugraph_pyg.data import TensorDictFeatureStore, GraphStore
-from cugraph_pyg.loader import NeighborLoader, LinkNeighborLoader
-
-from cugraph.gnn import (
-    cugraph_comms_init,
-    cugraph_comms_shutdown,
-    cugraph_comms_create_unique_id,
-)
-
-os.environ["RAPIDS_NO_INITIALIZE"] = "1"
-
-torch = import_optional("torch")
-torch_geometric = import_optional("torch_geometric")
-
-
-def init_pytorch_worker(rank, world_size, cugraph_id):
-    import rmm
-
-    rmm.reinitialize(
-        devices=rank,
-        pool_allocator=False,
-    )
-
-    import cupy
-
-    cupy.cuda.Device(rank).use()
-    from rmm.allocators.cupy import rmm_cupy_allocator
-
-    cupy.cuda.set_allocator(rmm_cupy_allocator)
-
-    from cugraph.testing.mg_utils import enable_spilling
-
-    enable_spilling()
-
-    torch.cuda.set_device(rank)
-
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "12355"
-    torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size)
-
-    cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank)
-
-
-def run_test_neighbor_loader_mg(rank, uid, world_size, specify_size):
-    """
-    Basic e2e test that covers loading and sampling.
-    """
-    init_pytorch_worker(rank, world_size, uid)
-
-    df = karate.get_edgelist()
-    src = torch.as_tensor(df["src"], device="cuda")
-    dst = torch.as_tensor(df["dst"], device="cuda")
-
-    ei = torch.stack([dst, src])
-    ei = torch.tensor_split(ei.clone(), world_size, axis=1)[rank]
-
-    sz = (34, 34) if specify_size else None
-    graph_store = GraphStore(is_multi_gpu=True)
-    graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo", False, sz)
-
-    feature_store = TensorDictFeatureStore()
-    feature_store["person", "feat"] = torch.randint(128, (34, 16))
-
-    ix_train = torch.tensor_split(torch.arange(34), world_size, axis=0)[rank]
-
-    loader = NeighborLoader(
-        (feature_store, graph_store),
-        [5, 5],
-        input_nodes=ix_train,
-    )
-
-    for batch in loader:
-        assert isinstance(batch, torch_geometric.data.Data)
-        assert (feature_store["person", "feat"][batch.n_id] == batch.feat).all()
-
-    cugraph_comms_shutdown()
-
-
-@pytest.mark.skip(reason="deleteme")
-@pytest.mark.parametrize("specify_size", [True, False])
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_neighbor_loader_mg(specify_size):
-    uid = cugraph_comms_create_unique_id()
-    world_size = torch.cuda.device_count()
-
-    torch.multiprocessing.spawn(
-        run_test_neighbor_loader_mg,
-        args=(
-            uid,
-            world_size,
-            specify_size,
-        ),
-        nprocs=world_size,
-    )
-
-
-def run_test_neighbor_loader_biased_mg(rank, uid, world_size):
-    init_pytorch_worker(rank, world_size, uid)
-
-    eix = torch.stack(
-        [
-            torch.arange(
-                3 * (world_size + rank),
-                3 * (world_size + rank + 1),
-                dtype=torch.int64,
-                device="cuda",
-            ),
-            torch.arange(3 * rank, 3 * (rank + 1), dtype=torch.int64, device="cuda"),
-        ]
-    )
-
-    graph_store = GraphStore(is_multi_gpu=True)
-    graph_store.put_edge_index(eix, ("person", "knows", "person"), "coo")
-
-    feature_store = TensorDictFeatureStore()
-    feature_store["person", "feat"] = torch.randint(128, (6 * world_size, 12))
-    feature_store[("person", "knows", "person"), "bias"] = torch.concat(
-        [torch.tensor([0, 1, 1], dtype=torch.float32) for _ in range(world_size)]
-    )
-
-    loader = NeighborLoader(
-        (feature_store, graph_store),
-        [1],
-        input_nodes=torch.arange(
-            3 * rank, 3 * (rank + 1), dtype=torch.int64, device="cuda"
-        ),
-        batch_size=3,
-        weight_attr="bias",
-    )
-
-    out = list(iter(loader))
-    assert len(out) == 1
-    out = out[0]
-
-    assert (
-        out.edge_index.cpu()
-        == torch.tensor(
-            [
-                [3, 4],
-                [1, 2],
-            ]
-        )
-    ).all()
-
-    cugraph_comms_shutdown()
-
-
-@pytest.mark.skip(reason="deleteme")
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_neighbor_loader_biased_mg():
-    uid = cugraph_comms_create_unique_id()
-    world_size = torch.cuda.device_count()
-
-    torch.multiprocessing.spawn(
-        run_test_neighbor_loader_biased_mg,
-        args=(
-            uid,
-            world_size,
-        ),
-        nprocs=world_size,
-    )
-
-
-def run_test_link_neighbor_loader_basic_mg(
-    rank,
-    uid,
-    world_size,
-    num_nodes: int,
-    num_edges: int,
-    select_edges: int,
-    batch_size: int,
-    num_neighbors: int,
-    depth: int,
-):
-    init_pytorch_worker(rank, world_size, uid)
-
-    graph_store = GraphStore(is_multi_gpu=True)
-    feature_store = TensorDictFeatureStore()
-
-    eix = torch.randperm(num_edges)[:select_edges]
-    graph_store[("n", "e", "n"), "coo"] = torch.stack(
-        [
-            torch.randint(0, num_nodes, (num_edges,)),
-            torch.randint(0, num_nodes, (num_edges,)),
-        ]
-    )
-
-    elx = graph_store[("n", "e", "n"), "coo"][:, eix]
-    loader = LinkNeighborLoader(
-        (feature_store, graph_store),
-        num_neighbors=[num_neighbors] * depth,
-        edge_label_index=elx,
-        batch_size=batch_size,
-        shuffle=False,
-    )
-
-    elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1)
-    for i, batch in enumerate(loader):
-        assert (
-            batch.input_id.cpu() == torch.arange(i * batch_size, (i + 1) * batch_size)
-        ).all()
-        assert (elx[i] == batch.n_id[batch.edge_label_index.cpu()]).all()
-
-    cugraph_comms_shutdown()
-
-
-@pytest.mark.skip(reason="deleteme")
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-@pytest.mark.parametrize("select_edges", [64, 128])
-@pytest.mark.parametrize("batch_size", [2, 4])
-@pytest.mark.parametrize("depth", [1, 3])
-def test_link_neighbor_loader_basic_mg(select_edges, batch_size, depth):
-    num_nodes = 25
-    num_edges = 128
-    num_neighbors = 2
-
-    uid = cugraph_comms_create_unique_id()
-    world_size = torch.cuda.device_count()
-
-    torch.multiprocessing.spawn(
-        run_test_link_neighbor_loader_basic_mg,
-        args=(
-            uid,
-            world_size,
-            num_nodes,
-            num_edges,
-            select_edges,
-            batch_size,
-            num_neighbors,
-            depth,
-        ),
-        nprocs=world_size,
-    )
-
-
-def run_test_link_neighbor_loader_uneven_mg(rank, uid, world_size, edge_index):
-    init_pytorch_worker(rank, world_size, uid)
-
-    graph_store = GraphStore(is_multi_gpu=True)
-    feature_store = TensorDictFeatureStore()
-
-    batch_size = 1
-    graph_store[("n", "e", "n"), "coo"] = torch.tensor_split(
-        edge_index, world_size, dim=-1
-    )[rank]
-
-    elx = graph_store[("n", "e", "n"), "coo"]  # select all edges on each worker
-    loader = LinkNeighborLoader(
-        (feature_store, graph_store),
-        num_neighbors=[2, 2, 2],
-        edge_label_index=elx,
-        batch_size=batch_size,
-        shuffle=False,
-    )
-
-    for i, batch in enumerate(loader):
-        assert (
-            batch.input_id.cpu() == torch.arange(i * batch_size, (i + 1) * batch_size)
-        ).all()
-
-        assert (elx[:, [i]] == batch.n_id[batch.edge_label_index.cpu()]).all()
-
-    cugraph_comms_shutdown()
-
-
-@pytest.mark.skip(reason="deleteme")
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_link_neighbor_loader_uneven_mg():
-    edge_index = torch.tensor(
-        [
-            [0, 1, 3, 4, 7],
-            [1, 0, 8, 9, 12],
-        ]
-    )
-
-    uid = cugraph_comms_create_unique_id()
-    world_size = torch.cuda.device_count()
-
-    torch.multiprocessing.spawn(
-        run_test_link_neighbor_loader_uneven_mg,
-        args=(
-            uid,
-            world_size,
-            edge_index,
-        ),
-        nprocs=world_size,
-    )
-
-
-def run_test_link_neighbor_loader_negative_sampling_basic_mg(
-    rank, world_size, uid, batch_size
-):
-    num_edges = 62 * world_size
-    num_nodes = 19 * world_size
-    select_edges = 17
-
-    init_pytorch_worker(rank, world_size, uid)
-
-    graph_store = GraphStore(is_multi_gpu=True)
-    feature_store = TensorDictFeatureStore()
-
-    eix = torch.randperm(num_edges)[:select_edges]
-    graph_store[("n", "e", "n"), "coo"] = torch.stack(
-        [
-            torch.randint(0, num_nodes, (num_edges,)),
-            torch.randint(0, num_nodes, (num_edges,)),
-        ]
-    )
-
-    elx = graph_store[("n", "e", "n"), "coo"][:, eix]
-    loader = LinkNeighborLoader(
-        (feature_store, graph_store),
-        num_neighbors=[3, 3, 3],
-        edge_label_index=elx,
-        batch_size=batch_size,
-        neg_sampling="binary",
-        shuffle=False,
-    )
-
-    elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1)
-    for i, batch in enumerate(loader):
-        assert batch.edge_label[0] == 1.0
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-@pytest.mark.parametrize("batch_size", [1, 2])
-def test_link_neighbor_loader_negative_sampling_basic_mg(batch_size):
-    uid = cugraph_comms_create_unique_id()
-    world_size = torch.cuda.device_count()
-
-    torch.multiprocessing.spawn(
-        run_test_link_neighbor_loader_negative_sampling_basic_mg,
-        args=(
-            world_size,
-            uid,
-            batch_size,
-        ),
-        nprocs=world_size,
-    )
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py
deleted file mode 100644
index 92d216fefa3..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_pyg.nn import GATConv as CuGraphGATConv
-from cugraph_pyg.utils.imports import package_available
-
-ATOL = 1e-6
-
-
-@pytest.mark.skipif(
-    package_available("torch_geometric<2.5"), reason="Test requires pyg>=2.5"
-)
-@pytest.mark.parametrize("use_edge_index", [True, False])
-@pytest.mark.parametrize("bias", [True, False])
-@pytest.mark.parametrize("bipartite", [True, False])
-@pytest.mark.parametrize("concat", [True, False])
-@pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16])
-@pytest.mark.parametrize("max_num_neighbors", [8, None])
-@pytest.mark.parametrize("use_edge_attr", [True, False])
-@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
-@pytest.mark.sg
-def test_gat_conv_equality(
-    use_edge_index,
-    bias,
-    bipartite,
-    concat,
-    heads,
-    max_num_neighbors,
-    use_edge_attr,
-    graph,
-    request,
-):
-    import torch
-    from torch_geometric import EdgeIndex
-    from torch_geometric.nn import GATConv
-
-    torch.manual_seed(12345)
-    edge_index, size = request.getfixturevalue(graph)
-    edge_index = edge_index.cuda()
-
-    if bipartite:
-        in_channels = (5, 3)
-        x = (
-            torch.rand(size[0], in_channels[0]).cuda(),
-            torch.rand(size[1], in_channels[1]).cuda(),
-        )
-    else:
-        in_channels = 5
-        x = torch.rand(size[0], in_channels).cuda()
-    out_channels = 2
-
-    if use_edge_attr:
-        edge_dim = 3
-        edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda()
-    else:
-        edge_dim = edge_attr = None
-
-    if use_edge_index:
-        csc = EdgeIndex(edge_index, sparse_size=size)
-    else:
-        if use_edge_attr:
-            csc, edge_attr_perm = CuGraphGATConv.to_csc(
-                edge_index, size, edge_attr=edge_attr
-            )
-        else:
-            csc = CuGraphGATConv.to_csc(edge_index, size)
-            edge_attr_perm = None
-
-    kwargs = dict(bias=bias, concat=concat, edge_dim=edge_dim)
-
-    conv1 = GATConv(
-        in_channels, out_channels, heads, add_self_loops=False, **kwargs
-    ).cuda()
-    conv2 = CuGraphGATConv(in_channels, out_channels, heads, **kwargs).cuda()
-
-    out_dim = heads * out_channels
-    with torch.no_grad():
-        if bipartite:
-            conv2.lin_src.weight.copy_(conv1.lin_src.weight)
-            conv2.lin_dst.weight.copy_(conv1.lin_dst.weight)
-        else:
-            conv2.lin.weight.copy_(conv1.lin.weight)
-
-        conv2.att[:out_dim].copy_(conv1.att_src.flatten())
-        conv2.att[out_dim : 2 * out_dim].copy_(conv1.att_dst.flatten())
-        if use_edge_attr:
-            conv2.att[2 * out_dim :].copy_(conv1.att_edge.flatten())
-            conv2.lin_edge.weight.copy_(conv1.lin_edge.weight)
-
-    out1 = conv1(x, edge_index, edge_attr=edge_attr)
-    if use_edge_index:
-        out2 = conv2(x, csc, edge_attr=edge_attr, max_num_neighbors=max_num_neighbors)
-    else:
-        out2 = conv2(
-            x, csc, edge_attr=edge_attr_perm, max_num_neighbors=max_num_neighbors
-        )
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_output = torch.rand_like(out1)
-    out1.backward(grad_output)
-    out2.backward(grad_output)
-
-    if bipartite:
-        assert torch.allclose(
-            conv1.lin_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL
-        )
-        assert torch.allclose(
-            conv1.lin_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL
-        )
-    else:
-        assert torch.allclose(conv1.lin.weight.grad, conv2.lin.weight.grad, atol=ATOL)
-
-    assert torch.allclose(
-        conv1.att_src.grad.flatten(), conv2.att.grad[:out_dim], atol=ATOL
-    )
-    assert torch.allclose(
-        conv1.att_dst.grad.flatten(), conv2.att.grad[out_dim : 2 * out_dim], atol=ATOL
-    )
-
-    if use_edge_attr:
-        assert torch.allclose(
-            conv1.att_edge.grad.flatten(), conv2.att.grad[2 * out_dim :], atol=ATOL
-        )
-        assert torch.allclose(
-            conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=ATOL
-        )
-
-    if bias:
-        assert torch.allclose(conv1.bias.grad, conv2.bias.grad, atol=ATOL)
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py
deleted file mode 100644
index 2e221922add..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_pyg.nn import GATv2Conv as CuGraphGATv2Conv
-
-ATOL = 1e-6
-
-
-@pytest.mark.parametrize("use_edge_index", [True, False])
-@pytest.mark.parametrize("bipartite", [True, False])
-@pytest.mark.parametrize("concat", [True, False])
-@pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16])
-@pytest.mark.parametrize("use_edge_attr", [True, False])
-@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
-@pytest.mark.sg
-def test_gatv2_conv_equality(
-    use_edge_index, bipartite, concat, heads, use_edge_attr, graph, request
-):
-    pytest.importorskip("torch_geometric", reason="PyG not available")
-    import torch
-    from torch_geometric import EdgeIndex
-    from torch_geometric.nn import GATv2Conv
-
-    torch.manual_seed(12345)
-    edge_index, size = request.getfixturevalue(graph)
-    edge_index = edge_index.cuda()
-
-    if bipartite:
-        in_channels = (5, 3)
-        x = (
-            torch.rand(size[0], in_channels[0]).cuda(),
-            torch.rand(size[1], in_channels[1]).cuda(),
-        )
-    else:
-        in_channels = 5
-        x = torch.rand(size[0], in_channels).cuda()
-    out_channels = 2
-
-    if use_edge_attr:
-        edge_dim = 3
-        edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda()
-    else:
-        edge_dim = edge_attr = None
-
-    if use_edge_index:
-        csc = EdgeIndex(edge_index, sparse_size=size)
-    else:
-        if use_edge_attr:
-            csc, edge_attr_perm = CuGraphGATv2Conv.to_csc(
-                edge_index, size, edge_attr=edge_attr
-            )
-        else:
-            csc = CuGraphGATv2Conv.to_csc(edge_index, size)
-            edge_attr_perm = None
-
-    kwargs = dict(bias=False, concat=concat, edge_dim=edge_dim)
-
-    conv1 = GATv2Conv(
-        in_channels, out_channels, heads, add_self_loops=False, **kwargs
-    ).cuda()
-    conv2 = CuGraphGATv2Conv(in_channels, out_channels, heads, **kwargs).cuda()
-
-    with torch.no_grad():
-        conv2.lin_src.weight.copy_(conv1.lin_l.weight)
-        conv2.lin_dst.weight.copy_(conv1.lin_r.weight)
-        conv2.att.copy_(conv1.att.flatten())
-        if use_edge_attr:
-            conv2.lin_edge.weight.copy_(conv1.lin_edge.weight)
-
-    out1 = conv1(x, edge_index, edge_attr=edge_attr)
-    if use_edge_index:
-        out2 = conv2(x, csc, edge_attr=edge_attr)
-    else:
-        out2 = conv2(x, csc, edge_attr=edge_attr_perm)
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_output = torch.rand_like(out1)
-    out1.backward(grad_output)
-    out2.backward(grad_output)
-
-    assert torch.allclose(conv1.lin_l.weight.grad, conv2.lin_src.weight.grad, atol=ATOL)
-    assert torch.allclose(conv1.lin_r.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL)
-
-    assert torch.allclose(conv1.att.grad.flatten(), conv2.att.grad, atol=ATOL)
-
-    if use_edge_attr:
-        assert torch.allclose(
-            conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=ATOL
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py
deleted file mode 100644
index f182869002a..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_pyg.nn import HeteroGATConv as CuGraphHeteroGATConv
-from cugraph_pyg.utils.imports import package_available
-
-ATOL = 1e-6
-
-
-@pytest.mark.cugraph_ops
-@pytest.mark.skipif(
-    package_available("torch_geometric<2.4"), reason="Test requires pyg>=2.4"
-)
-@pytest.mark.parametrize("heads", [1, 3, 10])
-@pytest.mark.parametrize("aggr", ["sum", "mean"])
-@pytest.mark.sg
-def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads):
-    import torch
-    from torch_geometric.data import HeteroData
-    from torch_geometric.nn import HeteroConv, GATConv
-
-    device = torch.device("cuda")
-    data = HeteroData(sample_pyg_hetero_data).to(device)
-
-    in_channels_dict = {k: v.size(1) for k, v in data.x_dict.items()}
-    out_channels = 2
-
-    convs_dict = {}
-    kwargs1 = dict(heads=heads, add_self_loops=False, bias=False)
-    for edge_type in data.edge_types:
-        src_t, _, dst_t = edge_type
-        in_channels_src, in_channels_dst = data.x_dict[src_t].size(-1), data.x_dict[
-            dst_t
-        ].size(-1)
-        if src_t == dst_t:
-            convs_dict[edge_type] = GATConv(in_channels_src, out_channels, **kwargs1)
-        else:
-            convs_dict[edge_type] = GATConv(
-                (in_channels_src, in_channels_dst), out_channels, **kwargs1
-            )
-
-    conv1 = HeteroConv(convs_dict, aggr=aggr).to(device)
-    kwargs2 = dict(
-        heads=heads,
-        aggr=aggr,
-        node_types=data.node_types,
-        edge_types=data.edge_types,
-        bias=False,
-    )
-    conv2 = CuGraphHeteroGATConv(in_channels_dict, out_channels, **kwargs2).to(device)
-
-    # copy over linear and attention weights
-    w_src, w_dst = conv2.split_tensors(conv2.lin_weights, dim=0)
-    with torch.no_grad():
-        for edge_type in conv2.edge_types:
-            src_t, _, dst_t = edge_type
-            if src_t == dst_t:
-                w_src[edge_type].copy_(conv1.convs[edge_type].lin.weight)
-            else:
-                w_src[edge_type].copy_(conv1.convs[edge_type].lin_src.weight)
-                if w_dst[edge_type] is not None:
-                    w_dst[edge_type].copy_(conv1.convs[edge_type].lin_dst.weight)
-
-            conv2.attn_weights[edge_type][: heads * out_channels].copy_(
-                conv1.convs[edge_type].att_src.flatten()
-            )
-            conv2.attn_weights[edge_type][heads * out_channels :].copy_(
-                conv1.convs[edge_type].att_dst.flatten()
-            )
-
-    out1 = conv1(data.x_dict, data.edge_index_dict)
-    out2 = conv2(data.x_dict, data.edge_index_dict)
-
-    for node_type in data.node_types:
-        assert torch.allclose(out1[node_type], out2[node_type], atol=ATOL)
-
-    loss1 = 0
-    loss2 = 0
-    for node_type in data.node_types:
-        loss1 += out1[node_type].mean()
-        loss2 += out2[node_type].mean()
-
-    loss1.backward()
-    loss2.backward()
-
-    # check gradient w.r.t attention weights
-    out_dim = heads * out_channels
-    for edge_type in conv2.edge_types:
-        assert torch.allclose(
-            conv1.convs[edge_type].att_src.grad.flatten(),
-            conv2.attn_weights[edge_type].grad[:out_dim],
-            atol=ATOL,
-        )
-        assert torch.allclose(
-            conv1.convs[edge_type].att_dst.grad.flatten(),
-            conv2.attn_weights[edge_type].grad[out_dim:],
-            atol=ATOL,
-        )
-
-    # check gradient w.r.t linear weights
-    grad_lin_weights_ref = dict.fromkeys(out1.keys())
-    for node_t, (rels_as_src, rels_as_dst) in conv2.relations_per_ntype.items():
-        grad_list = []
-        for rel_t in rels_as_src:
-            src_type, _, dst_type = rel_t
-            if src_type == dst_type:
-                grad_list.append(conv1.convs[rel_t].lin.weight.grad.clone())
-            else:
-                grad_list.append(conv1.convs[rel_t].lin_src.weight.grad.clone())
-        for rel_t in rels_as_dst:
-            grad_list.append(conv1.convs[rel_t].lin_dst.weight.grad.clone())
-        assert len(grad_list) > 0
-        grad_lin_weights_ref[node_t] = torch.vstack(grad_list)
-
-    for node_type in conv2.lin_weights:
-        assert torch.allclose(
-            grad_lin_weights_ref[node_type],
-            conv2.lin_weights[node_type].grad,
-            atol=ATOL,
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py
deleted file mode 100644
index 8b06cb2e180..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_pyg.nn import RGCNConv as CuGraphRGCNConv
-
-ATOL = 1e-6
-
-
-@pytest.mark.parametrize("use_edge_index", [True, False])
-@pytest.mark.parametrize("aggr", ["add", "sum", "mean"])
-@pytest.mark.parametrize("bias", [True, False])
-@pytest.mark.parametrize("max_num_neighbors", [8, None])
-@pytest.mark.parametrize("num_bases", [1, 2, None])
-@pytest.mark.parametrize("root_weight", [True, False])
-@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
-@pytest.mark.sg
-def test_rgcn_conv_equality(
-    use_edge_index,
-    aggr,
-    bias,
-    max_num_neighbors,
-    num_bases,
-    root_weight,
-    graph,
-    request,
-):
-    pytest.importorskip("torch_geometric", reason="PyG not available")
-    import torch
-    from torch_geometric import EdgeIndex
-    from torch_geometric.nn import FastRGCNConv as RGCNConv
-
-    torch.manual_seed(12345)
-    in_channels, out_channels, num_relations = (4, 2, 3)
-    kwargs = dict(aggr=aggr, bias=bias, num_bases=num_bases, root_weight=root_weight)
-
-    edge_index, size = request.getfixturevalue(graph)
-    edge_index = edge_index.cuda()
-    edge_type = torch.randint(num_relations, (edge_index.size(1),)).cuda()
-
-    if use_edge_index:
-        csc = EdgeIndex(edge_index, sparse_size=size)
-    else:
-        csc, edge_type_perm = CuGraphRGCNConv.to_csc(edge_index, size, edge_type)
-
-    x = torch.rand(size[0], in_channels, device="cuda")
-
-    conv1 = RGCNConv(in_channels, out_channels, num_relations, **kwargs).cuda()
-    conv2 = CuGraphRGCNConv(in_channels, out_channels, num_relations, **kwargs).cuda()
-
-    with torch.no_grad():
-        if root_weight:
-            conv2.weight[:-1].copy_(conv1.weight)
-            conv2.weight[-1].copy_(conv1.root)
-        else:
-            conv2.weight.copy_(conv1.weight)
-        if num_bases is not None:
-            conv2.comp.copy_(conv1.comp)
-
-    out1 = conv1(x, edge_index, edge_type)
-    if use_edge_index:
-        out2 = conv2(x, csc, edge_type)
-    else:
-        out2 = conv2(x, csc, edge_type_perm, max_num_neighbors=max_num_neighbors)
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_out = torch.rand_like(out1)
-    out1.backward(grad_out)
-    out2.backward(grad_out)
-
-    if root_weight:
-        assert torch.allclose(conv1.weight.grad, conv2.weight.grad[:-1], atol=ATOL)
-        assert torch.allclose(conv1.root.grad, conv2.weight.grad[-1], atol=ATOL)
-    else:
-        assert torch.allclose(conv1.weight.grad, conv2.weight.grad, atol=ATOL)
-
-    if num_bases is not None:
-        assert torch.allclose(conv1.comp.grad, conv2.comp.grad, atol=ATOL)
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py
deleted file mode 100644
index 878ceff632a..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv
-
-ATOL = 1e-6
-
-
-@pytest.mark.parametrize("use_edge_index", [True, False])
-@pytest.mark.parametrize("aggr", ["sum", "mean", "min", "max"])
-@pytest.mark.parametrize("bias", [True, False])
-@pytest.mark.parametrize("bipartite", [True, False])
-@pytest.mark.parametrize("max_num_neighbors", [8, None])
-@pytest.mark.parametrize("normalize", [True, False])
-@pytest.mark.parametrize("root_weight", [True, False])
-@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
-@pytest.mark.sg
-def test_sage_conv_equality(
-    use_edge_index,
-    aggr,
-    bias,
-    bipartite,
-    max_num_neighbors,
-    normalize,
-    root_weight,
-    graph,
-    request,
-):
-    pytest.importorskip("torch_geometric", reason="PyG not available")
-    import torch
-    from torch_geometric import EdgeIndex
-    from torch_geometric.nn import SAGEConv
-
-    torch.manual_seed(12345)
-    edge_index, size = request.getfixturevalue(graph)
-    edge_index = edge_index.cuda()
-
-    if use_edge_index:
-        csc = EdgeIndex(edge_index, sparse_size=size)
-    else:
-        csc = CuGraphSAGEConv.to_csc(edge_index, size)
-
-    if bipartite:
-        in_channels = (7, 3)
-        x = (
-            torch.rand(size[0], in_channels[0]).cuda(),
-            torch.rand(size[1], in_channels[1]).cuda(),
-        )
-    else:
-        in_channels = 5
-        x = torch.rand(size[0], in_channels).cuda()
-    out_channels = 4
-
-    kwargs = dict(aggr=aggr, bias=bias, normalize=normalize, root_weight=root_weight)
-
-    conv1 = SAGEConv(in_channels, out_channels, **kwargs).cuda()
-    conv2 = CuGraphSAGEConv(in_channels, out_channels, **kwargs).cuda()
-
-    in_channels_src = conv2.in_channels_src
-    with torch.no_grad():
-        conv2.lin.weight[:, :in_channels_src].copy_(conv1.lin_l.weight)
-        if root_weight:
-            conv2.lin.weight[:, in_channels_src:].copy_(conv1.lin_r.weight)
-        if bias:
-            conv2.lin.bias.copy_(conv1.lin_l.bias)
-
-    out1 = conv1(x, edge_index)
-    out2 = conv2(x, csc, max_num_neighbors=max_num_neighbors)
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_out = torch.rand_like(out1)
-    out1.backward(grad_out)
-    out2.backward(grad_out)
-
-    assert torch.allclose(
-        conv1.lin_l.weight.grad,
-        conv2.lin.weight.grad[:, :in_channels_src],
-        atol=ATOL,
-    )
-
-    if root_weight:
-        assert torch.allclose(
-            conv1.lin_r.weight.grad,
-            conv2.lin.weight.grad[:, in_channels_src:],
-            atol=ATOL,
-        )
-
-    if bias:
-        assert torch.allclose(
-            conv1.lin_l.bias.grad,
-            conv2.lin.bias.grad,
-            atol=ATOL,
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py
deleted file mode 100644
index d207a4d7947..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_pyg.nn import TransformerConv as CuGraphTransformerConv
-
-ATOL = 1e-6
-
-
-@pytest.mark.parametrize("use_edge_index", [True, False])
-@pytest.mark.parametrize("use_edge_attr", [True, False])
-@pytest.mark.parametrize("bipartite", [True, False])
-@pytest.mark.parametrize("concat", [True, False])
-@pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16])
-@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
-@pytest.mark.sg
-def test_transformer_conv_equality(
-    use_edge_index, use_edge_attr, bipartite, concat, heads, graph, request
-):
-    pytest.importorskip("torch_geometric", reason="PyG not available")
-    import torch
-    from torch_geometric import EdgeIndex
-    from torch_geometric.nn import TransformerConv
-
-    torch.manual_seed(12345)
-    edge_index, size = request.getfixturevalue(graph)
-    edge_index = edge_index.cuda()
-
-    if bipartite:
-        in_channels = (5, 3)
-        x = (
-            torch.rand(size[0], in_channels[0], device="cuda"),
-            torch.rand(size[1], in_channels[1], device="cuda"),
-        )
-    else:
-        in_channels = 5
-        x = torch.rand(size[0], in_channels, device="cuda")
-    out_channels = 2
-
-    if use_edge_attr:
-        edge_dim = 3
-        edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda()
-    else:
-        edge_dim = edge_attr = None
-
-    if use_edge_index:
-        csc = EdgeIndex(edge_index, sparse_size=size)
-    else:
-        if use_edge_attr:
-            csc, edge_attr_perm = CuGraphTransformerConv.to_csc(
-                edge_index, size, edge_attr=edge_attr
-            )
-        else:
-            csc = CuGraphTransformerConv.to_csc(edge_index, size)
-            edge_attr_perm = None
-
-    kwargs = dict(concat=concat, bias=False, edge_dim=edge_dim, root_weight=False)
-
-    conv1 = TransformerConv(in_channels, out_channels, heads, **kwargs).cuda()
-    conv2 = CuGraphTransformerConv(in_channels, out_channels, heads, **kwargs).cuda()
-
-    with torch.no_grad():
-        conv2.lin_query.weight.copy_(conv1.lin_query.weight)
-        conv2.lin_key.weight.copy_(conv1.lin_key.weight)
-        conv2.lin_value.weight.copy_(conv1.lin_value.weight)
-        conv2.lin_query.bias.copy_(conv1.lin_query.bias)
-        conv2.lin_key.bias.copy_(conv1.lin_key.bias)
-        conv2.lin_value.bias.copy_(conv1.lin_value.bias)
-        if use_edge_attr:
-            conv2.lin_edge.weight.copy_(conv1.lin_edge.weight)
-
-    out1 = conv1(x, edge_index, edge_attr=edge_attr)
-    if use_edge_index:
-        out2 = conv2(x, csc, edge_attr=edge_attr)
-    else:
-        out2 = conv2(x, csc, edge_attr=edge_attr_perm)
-
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_output = torch.rand_like(out1)
-    out1.backward(grad_output)
-    out2.backward(grad_output)
-
-    assert torch.allclose(
-        conv1.lin_query.weight.grad, conv2.lin_query.weight.grad, atol=ATOL
-    )
-    assert torch.allclose(
-        conv1.lin_key.weight.grad, conv2.lin_key.weight.grad, atol=ATOL
-    )
-    assert torch.allclose(
-        conv1.lin_value.weight.grad, conv2.lin_value.weight.grad, atol=ATOL
-    )
-    assert torch.allclose(
-        conv1.lin_query.bias.grad, conv2.lin_query.bias.grad, atol=ATOL
-    )
-    assert torch.allclose(conv1.lin_key.bias.grad, conv2.lin_key.bias.grad, atol=ATOL)
-    assert torch.allclose(
-        conv1.lin_value.bias.grad, conv2.lin_value.bias.grad, atol=ATOL
-    )
-
-    if use_edge_attr:
-        assert torch.allclose(
-            conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=ATOL
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/pytest.ini b/python/cugraph-pyg/cugraph_pyg/tests/pytest.ini
deleted file mode 100644
index 7b0a9f29fb1..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/pytest.ini
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-[pytest]
-addopts = --tb=native
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py
deleted file mode 100644
index 7659fdc386f..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import cudf
-import cupy
-
-import pytest
-
-from cugraph_pyg.data import DaskGraphStore
-from cugraph_pyg.sampler.sampler_utils import (
-    _sampler_output_from_sampling_results_heterogeneous,
-)
-
-from cugraph.utilities.utils import import_optional, MissingModule
-from cugraph import uniform_neighbor_sample
-
-torch = import_optional("torch")
-
-
-@pytest.mark.cugraph_ops
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_neighbor_sample(basic_graph_1):
-    F, G, N = basic_graph_1
-    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
-
-    batches = cudf.DataFrame(
-        {
-            "start": cudf.Series([0, 1, 2, 3, 4], dtype="int64"),
-            "batch": cudf.Series(cupy.zeros(5, dtype="int32")),
-        }
-    )
-
-    sampling_results = uniform_neighbor_sample(
-        cugraph_store._subgraph(),
-        batches,
-        fanout_vals=[-1],
-        with_replacement=False,
-        with_edge_properties=True,
-        with_batch_ids=True,
-        random_state=62,
-        return_offsets=False,
-        use_legacy_names=False,
-    ).sort_values(by=["majors", "minors"])
-
-    out = _sampler_output_from_sampling_results_heterogeneous(
-        sampling_results=sampling_results,
-        renumber_map=None,
-        graph_store=cugraph_store,
-        metadata=torch.arange(6, dtype=torch.int64),
-    )
-
-    noi_groups = out.node
-    row_dict = out.row
-    col_dict = out.col
-    metadata = out.metadata
-
-    assert metadata.tolist() == list(range(6))
-
-    for node_type, node_ids in noi_groups.items():
-        actual_vertex_ids = torch.arange(N[node_type])
-
-        assert sorted(node_ids.tolist()) == actual_vertex_ids.tolist()
-
-    assert (
-        row_dict[("vt1", "pig", "vt1")].tolist() == G[("vt1", "pig", "vt1")][0].tolist()
-    )
-    assert (
-        col_dict[("vt1", "pig", "vt1")].tolist() == G[("vt1", "pig", "vt1")][1].tolist()
-    )
-
-    # check the hop dictionaries
-    assert len(out.num_sampled_nodes) == 1
-    assert out.num_sampled_nodes["vt1"] == [4, 1]
-
-    assert len(out.num_sampled_edges) == 1
-    assert out.num_sampled_edges[("vt1", "pig", "vt1")] == [6]
-
-
-@pytest.mark.cugraph_ops
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1):
-    F, G, N = multi_edge_multi_vertex_graph_1
-    cugraph_store = DaskGraphStore(F, G, N, order="CSR")
-
-    batches = cudf.DataFrame(
-        {
-            "start": cudf.Series([0, 1, 2, 3, 4], dtype="int64"),
-            "batch": cudf.Series(cupy.zeros(5, dtype="int32")),
-        }
-    )
-
-    sampling_results = uniform_neighbor_sample(
-        cugraph_store._subgraph(),
-        batches,
-        fanout_vals=[-1],
-        with_replacement=False,
-        with_edge_properties=True,
-        random_state=62,
-        return_offsets=False,
-        with_batch_ids=True,
-        use_legacy_names=False,
-    ).sort_values(by=["majors", "minors"])
-
-    out = _sampler_output_from_sampling_results_heterogeneous(
-        sampling_results=sampling_results,
-        renumber_map=None,
-        graph_store=cugraph_store,
-        metadata=torch.arange(6, dtype=torch.int64),
-    )
-
-    noi_groups = out.node
-    row_dict = out.row
-    col_dict = out.col
-    metadata = out.metadata
-
-    assert metadata.tolist() == list(range(6))
-
-    for node_type, node_ids in noi_groups.items():
-        actual_vertex_ids = torch.arange(N[node_type])
-
-        assert node_ids.tolist() == sorted(actual_vertex_ids.tolist())
-
-    for edge_type, ei in G.items():
-        assert sorted(row_dict[edge_type].tolist()) == sorted(ei[0].tolist())
-        assert sorted(col_dict[edge_type].tolist()) == sorted(ei[1].tolist())
-
-    # check the hop dictionaries
-    assert len(out.num_sampled_nodes) == 2
-    assert out.num_sampled_nodes["black"] == [2, 0]
-    assert out.num_sampled_nodes["brown"] == [3, 0]
-
-    assert len(out.num_sampled_edges) == 5
-    assert out.num_sampled_edges[("brown", "horse", "brown")] == [2]
-    assert out.num_sampled_edges[("brown", "tortoise", "black")] == [3]
-    assert out.num_sampled_edges[("brown", "mongoose", "black")] == [2]
-    assert out.num_sampled_edges[("black", "cow", "brown")] == [2]
-    assert out.num_sampled_edges[("black", "snake", "black")] == [1]
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.sg
-def test_neighbor_sample_mock_sampling_results(abc_graph):
-    F, G, N = abc_graph
-
-    graph_store = DaskGraphStore(F, G, N, order="CSR")
-
-    # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3]
-    mock_sampling_results = cudf.DataFrame(
-        {
-            "majors": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"),
-            "minors": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"),
-            "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"),
-            "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"),
-        }
-    )
-
-    out = _sampler_output_from_sampling_results_heterogeneous(
-        mock_sampling_results, None, graph_store, None
-    )
-
-    assert out.metadata is None
-    assert len(out.node) == 3
-    assert out.node["A"].tolist() == [0, 1]
-    assert out.node["B"].tolist() == [0, 1]
-    assert out.node["C"].tolist() == [3, 2, 0]
-
-    assert len(out.row) == 3
-    assert len(out.col) == 3
-    assert out.row[("A", "ab", "B")].tolist() == [0, 0, 1, 1]
-    assert out.col[("A", "ab", "B")].tolist() == [0, 1, 1, 1]
-    assert out.row[("B", "bc", "C")].tolist() == [0, 1, 1, 1]
-    assert out.col[("B", "bc", "C")].tolist() == [0, 1, 2, 1]
-    assert out.row[("B", "ba", "A")].tolist() == [1, 1]
-    assert out.col[("B", "ba", "A")].tolist() == [1, 1]
-
-    assert len(out.num_sampled_nodes) == 3
-    assert out.num_sampled_nodes["A"] == [2, 0, 0, 0, 0]
-    assert out.num_sampled_nodes["B"] == [0, 2, 0, 0, 0]
-    assert out.num_sampled_nodes["C"] == [0, 0, 2, 0, 1]
-
-    assert len(out.num_sampled_edges) == 3
-    assert out.num_sampled_edges[("A", "ab", "B")] == [3, 0, 1, 0]
-    assert out.num_sampled_edges[("B", "ba", "A")] == [0, 1, 0, 1]
-    assert out.num_sampled_edges[("B", "bc", "C")] == [0, 2, 0, 2]
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py
deleted file mode 100644
index 91e0668b3c1..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py
+++ /dev/null
@@ -1,235 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import cudf
-import cupy
-
-import pytest
-
-from cugraph_pyg.data import DaskGraphStore
-from cugraph_pyg.sampler.sampler_utils import (
-    _sampler_output_from_sampling_results_heterogeneous,
-)
-
-from cugraph.gnn import FeatureStore
-
-from cugraph.utilities.utils import import_optional, MissingModule
-from cugraph.dask import uniform_neighbor_sample
-
-torch = import_optional("torch")
-
-
-@pytest.mark.cugraph_ops
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_neighbor_sample(dask_client, basic_graph_1):
-    F, G, N = basic_graph_1
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR")
-
-    batches = cudf.DataFrame(
-        {
-            "start": cudf.Series([0, 1, 2, 3, 4], dtype="int64"),
-            "batch": cudf.Series(cupy.zeros(5, dtype="int32")),
-        }
-    )
-
-    sampling_results = (
-        uniform_neighbor_sample(
-            cugraph_store._subgraph(),
-            batches,
-            with_batch_ids=True,
-            fanout_vals=[-1],
-            with_replacement=False,
-            with_edge_properties=True,
-            random_state=62,
-            return_offsets=False,
-            return_hops=True,
-            use_legacy_names=False,
-        )
-        .compute()
-        .sort_values(by=["majors", "minors"])
-    )
-
-    out = _sampler_output_from_sampling_results_heterogeneous(
-        sampling_results=sampling_results,
-        renumber_map=None,
-        graph_store=cugraph_store,
-        metadata=torch.arange(6, dtype=torch.int64),
-    )
-
-    noi_groups = out.node
-    row_dict = out.row
-    col_dict = out.col
-    metadata = out.metadata
-
-    assert metadata.tolist() == list(range(6))
-
-    for node_type, node_ids in noi_groups.items():
-        actual_vertex_ids = torch.arange(N[node_type])
-
-        assert sorted(node_ids.tolist()) == actual_vertex_ids.tolist()
-
-    assert (
-        row_dict[("vt1", "pig", "vt1")].tolist() == G[("vt1", "pig", "vt1")][0].tolist()
-    )
-    assert (
-        col_dict[("vt1", "pig", "vt1")].tolist() == G[("vt1", "pig", "vt1")][1].tolist()
-    )
-
-    # check the hop dictionaries
-    assert len(out.num_sampled_nodes) == 1
-    assert out.num_sampled_nodes["vt1"] == [4, 1]
-
-    assert len(out.num_sampled_edges) == 1
-    assert out.num_sampled_edges[("vt1", "pig", "vt1")] == [6]
-
-
-@pytest.mark.cugraph_ops
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.skip(reason="broken")
-@pytest.mark.mg
-def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph_1):
-    F, G, N = multi_edge_multi_vertex_graph_1
-    cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR")
-
-    batches = cudf.DataFrame(
-        {
-            "start": cudf.Series([0, 1, 2, 3, 4], dtype="int64"),
-            "batches": cudf.Series(cupy.zeros(5, dtype="int32")),
-        }
-    )
-
-    sampling_results = (
-        uniform_neighbor_sample(
-            cugraph_store._subgraph(),
-            batches,
-            fanout_vals=[-1],
-            with_replacement=False,
-            with_edge_properties=True,
-            random_state=62,
-            return_offsets=False,
-            with_batch_ids=True,
-            use_legacy_names=False,
-        )
-        .sort_values(by=["majors", "minors"])
-        .compute()
-    )
-
-    out = _sampler_output_from_sampling_results_heterogeneous(
-        sampling_results=sampling_results,
-        renumber_map=None,
-        graph_store=cugraph_store,
-        metadata=torch.arange(6, dtype=torch.int64),
-    )
-
-    noi_groups = out.node
-    row_dict = out.row
-    col_dict = out.col
-    metadata = out.metadata
-
-    assert metadata.tolist() == list(range(6))
-
-    for node_type, node_ids in noi_groups.items():
-        actual_vertex_ids = torch.arange(N[node_type])
-
-        assert node_ids.tolist() == sorted(actual_vertex_ids.tolist())
-
-    for edge_type, ei in G.items():
-        assert sorted(row_dict[edge_type].tolist()) == sorted(ei[0].tolist())
-        assert sorted(col_dict[edge_type].tolist()) == sorted(ei[1].tolist())
-
-    # check the hop dictionaries
-    assert len(out.num_sampled_nodes) == 2
-    assert out.num_sampled_nodes["black"].tolist() == [2, 0]
-    assert out.num_sampled_nodes["brown"].tolist() == [3, 0]
-
-    assert len(out.num_sampled_edges) == 5
-    assert out.num_sampled_edges[("brown", "horse", "brown")].tolist() == [2]
-    assert out.num_sampled_edges[("brown", "tortoise", "black")].tolist() == [3]
-    assert out.num_sampled_edges[("brown", "mongoose", "black")].tolist() == [2]
-    assert out.num_sampled_edges[("black", "cow", "brown")].tolist() == [2]
-    assert out.num_sampled_edges[("black", "snake", "black")].tolist() == [1]
-
-
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-@pytest.mark.mg
-def test_neighbor_sample_mock_sampling_results(dask_client):
-    N = {
-        "A": 2,  # 0, 1
-        "B": 3,  # 2, 3, 4
-        "C": 4,  # 5, 6, 7, 8
-    }
-
-    G = {
-        # (0->2, 0->3, 1->3)
-        ("A", "ab", "B"): [
-            torch.tensor([0, 0, 1], dtype=torch.int64),
-            torch.tensor([0, 1, 1], dtype=torch.int64),
-        ],
-        # (2->0, 2->1, 3->1, 4->0)
-        ("B", "ba", "A"): [
-            torch.tensor([0, 0, 1, 2], dtype=torch.int64),
-            torch.tensor([0, 1, 1, 0], dtype=torch.int64),
-        ],
-        # (2->6, 2->8, 3->5, 3->7, 4->5, 4->8)
-        ("B", "bc", "C"): [
-            torch.tensor([0, 0, 1, 1, 2, 2], dtype=torch.int64),
-            torch.tensor([1, 3, 0, 2, 0, 3], dtype=torch.int64),
-        ],
-    }
-
-    F = FeatureStore()
-    F.add_data(
-        torch.tensor([3.2, 2.1], dtype=torch.float32), type_name="A", feat_name="prop1"
-    )
-
-    graph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR")
-
-    # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3]
-    mock_sampling_results = cudf.DataFrame(
-        {
-            "majors": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"),
-            "minors": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"),
-            "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"),
-            "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"),
-        }
-    )
-
-    out = _sampler_output_from_sampling_results_heterogeneous(
-        mock_sampling_results, None, graph_store, None
-    )
-
-    assert out.metadata is None
-    assert len(out.node) == 3
-    assert out.node["A"].tolist() == [0, 1]
-    assert out.node["B"].tolist() == [0, 1]
-    assert out.node["C"].tolist() == [3, 2, 0]
-
-    assert len(out.row) == 3
-    assert len(out.col) == 3
-    assert out.row[("A", "ab", "B")].tolist() == [0, 0, 1, 1]
-    assert out.col[("A", "ab", "B")].tolist() == [0, 1, 1, 1]
-    assert out.row[("B", "bc", "C")].tolist() == [0, 1, 1, 1]
-    assert out.col[("B", "bc", "C")].tolist() == [0, 1, 2, 1]
-    assert out.row[("B", "ba", "A")].tolist() == [1, 1]
-    assert out.col[("B", "ba", "A")].tolist() == [1, 1]
-
-    assert len(out.num_sampled_nodes) == 3
-    assert out.num_sampled_nodes["A"] == [2, 0, 0, 0, 0]
-    assert out.num_sampled_nodes["B"] == [0, 2, 0, 0, 0]
-    assert out.num_sampled_nodes["C"] == [0, 0, 2, 0, 1]
-
-    assert len(out.num_sampled_edges) == 3
-    assert out.num_sampled_edges[("A", "ab", "B")] == [3, 0, 1, 0]
-    assert out.num_sampled_edges[("B", "ba", "A")] == [0, 1, 0, 1]
-    assert out.num_sampled_edges[("B", "bc", "C")] == [0, 2, 0, 2]
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_version.py b/python/cugraph-pyg/cugraph_pyg/tests/test_version.py
deleted file mode 100644
index 4ea0f9875f5..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/test_version.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-import cugraph_pyg
-
-
-def test_version_constants_are_populated():
-    # __git_commit__ will only be non-empty in a built distribution
-    assert isinstance(cugraph_pyg.__git_commit__, str)
-
-    # __version__ should always be non-empty
-    assert isinstance(cugraph_pyg.__version__, str)
-    assert len(cugraph_pyg.__version__) > 0
diff --git a/python/cugraph-pyg/cugraph_pyg/utils/__init__.py b/python/cugraph-pyg/cugraph_pyg/utils/__init__.py
deleted file mode 100644
index aeae6078111..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/utils/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/cugraph-pyg/cugraph_pyg/utils/imports.py b/python/cugraph-pyg/cugraph_pyg/utils/imports.py
deleted file mode 100644
index 1cc865a1f35..00000000000
--- a/python/cugraph-pyg/cugraph_pyg/utils/imports.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from packaging.requirements import Requirement
-from importlib import import_module
-
-
-def package_available(requirement: str) -> bool:
-    """Check if a package is installed and meets the version requirement."""
-    req = Requirement(requirement)
-    try:
-        pkg = import_module(req.name)
-    except ImportError:
-        return False
-
-    if len(req.specifier) > 0:
-        if hasattr(pkg, "__version__"):
-            return pkg.__version__ in req.specifier
-        else:
-            return False
-
-    return True
diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml
deleted file mode 100644
index 1070623e718..00000000000
--- a/python/cugraph-pyg/pyproject.toml
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-
-[build-system]
-
-requires = [
-    "rapids-build-backend>=0.3.1,<0.4.0.dev0",
-    "setuptools>=61.0.0",
-    "wheel",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-build-backend = "rapids_build_backend.build"
-
-[tool.pytest.ini_options]
-testpaths = ["cugraph_pyg/tests"]
-
-[project]
-name = "cugraph-pyg"
-dynamic = ["version"]
-description = "cugraph-pyg - PyG support for cuGraph massive-scale, ultra-fast GPU graph analytics."
-authors = [
-    { name = "NVIDIA Corporation" },
-]
-license = { text = "Apache 2.0" }
-requires-python = ">=3.10"
-classifiers = [
-    "Intended Audience :: Developers",
-    "Programming Language :: Python",
-    "Programming Language :: Python :: 3.10",
-    "Programming Language :: Python :: 3.11",
-    "Programming Language :: Python :: 3.12",
-]
-dependencies = [
-    "cugraph==25.2.*,>=0.0.0a0",
-    "numba>=0.57",
-    "numpy>=1.23,<3.0a0",
-    "pylibcugraphops==25.2.*,>=0.0.0a0",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-
-[project.urls]
-Homepage = "https://github.com/rapidsai/cugraph"
-Documentation = "https://docs.rapids.ai/api/cugraph/stable/"
-
-[project.optional-dependencies]
-test = [
-    "pandas",
-    "pylibwholegraph==25.2.*,>=0.0.0a0",
-    "pytest",
-    "pytest-benchmark",
-    "pytest-cov",
-    "pytest-xdist",
-    "scipy",
-    "tensordict>=0.1.2",
-    "torch>=2.3,<2.4.0a0",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-
-[tool.setuptools]
-license-files = ["LICENSE"]
-
-[tool.setuptools.dynamic]
-version = {file = "cugraph_pyg/VERSION"}
-
-[tool.setuptools.packages.find]
-include = [
-    "cugraph_pyg*",
-    "cugraph_pyg.*",
-]
-
-[tool.rapids-build-backend]
-build-backend = "setuptools.build_meta"
-dependencies-file = "../../dependencies.yaml"
-matrix-entry = "cuda_suffixed=true"
diff --git a/python/cugraph-pyg/pytest.ini b/python/cugraph-pyg/pytest.ini
deleted file mode 100644
index 07c4ffa0958..00000000000
--- a/python/cugraph-pyg/pytest.ini
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-[pytest]
-addopts =
-           --benchmark-warmup=off
-           --benchmark-max-time=0
-           --benchmark-min-rounds=1
-           --benchmark-columns="mean, rounds"
-           --tb=native
-           ## do not run slow tests/benchmarks by default
-           -m "not slow"
-
-markers =
-          slow: slow-running tests/benchmarks
-          cugraph_ops: Tests requiring cugraph-ops
-          mg: Test MG code paths - number of gpu > 1
-          sg: Test SG code paths and dask sg tests - number of gpu == 1
-
-python_classes =
-          Bench*
-          Test*
-
-python_files =
-          bench_*
-          test_*
-
-python_functions =
-          bench_*
-          test_*
diff --git a/python/cugraph/cugraph/centrality/betweenness_centrality.py b/python/cugraph/cugraph/centrality/betweenness_centrality.py
index dd47b1e8df0..00a9972efba 100644
--- a/python/cugraph/cugraph/centrality/betweenness_centrality.py
+++ b/python/cugraph/cugraph/centrality/betweenness_centrality.py
@@ -58,6 +58,11 @@ def betweenness_centrality(
         Algorithm (2001) to compute exact or approximate betweenness.
         If weights are provided in the edgelist, they will not be used.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     k : int, list or cudf object or None, optional (default=None)
         If k is not None, use k node samples to estimate betweenness. Higher
         values give better approximation.  If k is either a list, a cudf DataFrame,
@@ -224,6 +229,11 @@ def edge_betweenness_centrality(
         The current implementation uses BFS traversals. Use weight parameter
         if weights need to be considered (currently not supported).
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     k : int or list or None, optional (default=None)
         If k is not None, use k node samples to estimate betweenness. Higher
         values give better approximation.  If k is either a list, a cudf DataFrame,
diff --git a/python/cugraph/cugraph/centrality/degree_centrality.py b/python/cugraph/cugraph/centrality/degree_centrality.py
index 12d39f4127e..3e5e8842cf5 100644
--- a/python/cugraph/cugraph/centrality/degree_centrality.py
+++ b/python/cugraph/cugraph/centrality/degree_centrality.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -28,6 +28,11 @@ def degree_centrality(G, normalized=True):
         cuGraph graph descriptor with connectivity information. The graph can
         contain either directed or undirected edges.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     normalized : bool, optional, default=True
         If True normalize the resulting degree centrality values
 
diff --git a/python/cugraph/cugraph/centrality/eigenvector_centrality.py b/python/cugraph/cugraph/centrality/eigenvector_centrality.py
index 6be797096fc..d902a0dc947 100644
--- a/python/cugraph/cugraph/centrality/eigenvector_centrality.py
+++ b/python/cugraph/cugraph/centrality/eigenvector_centrality.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -37,6 +37,11 @@ def eigenvector_centrality(G, max_iter=100, tol=1.0e-6):
         cuGraph graph descriptor with connectivity information. The graph can
         contain either directed or undirected edges.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     max_iter : int, optional (default=100)
         The maximum number of iterations before an answer is returned. This can
         be used to limit the execution time and do an early exit before the
diff --git a/python/cugraph/cugraph/centrality/katz_centrality.py b/python/cugraph/cugraph/centrality/katz_centrality.py
index d902f9b06c9..1a44582d90e 100644
--- a/python/cugraph/cugraph/centrality/katz_centrality.py
+++ b/python/cugraph/cugraph/centrality/katz_centrality.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -48,6 +48,11 @@ def katz_centrality(
         cuGraph graph descriptor with connectivity information. The graph can
         contain either directed or undirected edges.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     alpha : float, optional (default=None)
         Attenuation factor defaulted to None. If alpha is not specified then
         it is internally calculated as 1/(degree_max) where degree_max is the
diff --git a/python/cugraph/cugraph/community/egonet.py b/python/cugraph/cugraph/community/egonet.py
index 56ae8ce70cc..e2a0833f6cb 100644
--- a/python/cugraph/cugraph/community/egonet.py
+++ b/python/cugraph/cugraph/community/egonet.py
@@ -62,6 +62,11 @@ def ego_graph(G, n, radius=1, center=True, undirected=None, distance=None):
         information. Edge weights, if present, should be single or double
         precision floating point values.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     n : integer or list, cudf.Series, cudf.DataFrame
         A single node as integer or a cudf.DataFrame if nodes are
         represented with multiple columns. If a cudf.DataFrame is provided,
diff --git a/python/cugraph/cugraph/community/induced_subgraph.py b/python/cugraph/cugraph/community/induced_subgraph.py
index c10241afa61..64d5ab09531 100644
--- a/python/cugraph/cugraph/community/induced_subgraph.py
+++ b/python/cugraph/cugraph/community/induced_subgraph.py
@@ -71,6 +71,11 @@ def induced_subgraph(
     G : cugraph.Graph or networkx.Graph
         The current implementation only supports weighted graphs.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     vertices : cudf.Series or cudf.DataFrame
         Specifies the vertices of the induced subgraph. For multi-column
         vertices, vertices should be provided as a cudf.DataFrame
diff --git a/python/cugraph/cugraph/community/ktruss_subgraph.py b/python/cugraph/cugraph/community/ktruss_subgraph.py
index bcf8527e17b..6535592856d 100644
--- a/python/cugraph/cugraph/community/ktruss_subgraph.py
+++ b/python/cugraph/cugraph/community/ktruss_subgraph.py
@@ -50,6 +50,11 @@ def k_truss(
         defined for only undirected graphs as they are defined for
         undirected triangle in a graph.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     k : int
         The desired k to be used for extracting the k-truss subgraph.
 
diff --git a/python/cugraph/cugraph/community/leiden.py b/python/cugraph/cugraph/community/leiden.py
index d2a1a413d7b..6abedcac955 100644
--- a/python/cugraph/cugraph/community/leiden.py
+++ b/python/cugraph/cugraph/community/leiden.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -56,6 +56,11 @@ def leiden(
 
         The adjacency list will be computed if not already present.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     max_iter : integer, optional (default=100)
         This controls the maximum number of levels/iterations of the Leiden
         algorithm. When specified the algorithm will terminate after no more
diff --git a/python/cugraph/cugraph/community/louvain.py b/python/cugraph/cugraph/community/louvain.py
index 0bedd427824..5c70a900b73 100644
--- a/python/cugraph/cugraph/community/louvain.py
+++ b/python/cugraph/cugraph/community/louvain.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -62,6 +62,11 @@ def louvain(
         present.
         The current implementation only supports undirected graphs.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     max_level : integer, optional (default=100)
         This controls the maximum number of levels of the Louvain
         algorithm. When specified the algorithm will terminate after no more
diff --git a/python/cugraph/cugraph/community/spectral_clustering.py b/python/cugraph/cugraph/community/spectral_clustering.py
index 8b4dbce830f..418e08e96d7 100644
--- a/python/cugraph/cugraph/community/spectral_clustering.py
+++ b/python/cugraph/cugraph/community/spectral_clustering.py
@@ -45,6 +45,11 @@ def spectralBalancedCutClustering(
     G : cugraph.Graph or networkx.Graph
         Graph descriptor
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     num_clusters : integer
         Specifies the number of clusters to find, must be greater than 1
 
@@ -142,6 +147,11 @@ def spectralModularityMaximizationClustering(
     G : cugraph.Graph or networkx.Graph
         cuGraph graph descriptor. This graph should have edge weights.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     num_clusters : integer
         Specifies the number of clusters to find
 
@@ -233,6 +243,11 @@ def analyzeClustering_modularity(
     G : cugraph.Graph or networkx.Graph
         graph descriptor. This graph should have edge weights.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     n_clusters : integer
         Specifies the number of clusters in the given clustering
 
diff --git a/python/cugraph/cugraph/community/subgraph_extraction.py b/python/cugraph/cugraph/community/subgraph_extraction.py
index 43169051be4..e49e681c096 100644
--- a/python/cugraph/cugraph/community/subgraph_extraction.py
+++ b/python/cugraph/cugraph/community/subgraph_extraction.py
@@ -46,6 +46,11 @@ def subgraph(
     G : cugraph.Graph or networkx.Graph
         The current implementation only supports weighted graphs.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     vertices : cudf.Series or cudf.DataFrame
         Specifies the vertices of the induced subgraph. For multi-column
         vertices, vertices should be provided as a cudf.DataFrame
diff --git a/python/cugraph/cugraph/community/triangle_count.py b/python/cugraph/cugraph/community/triangle_count.py
index 247327b6e4c..eb2ee1465bc 100644
--- a/python/cugraph/cugraph/community/triangle_count.py
+++ b/python/cugraph/cugraph/community/triangle_count.py
@@ -54,6 +54,11 @@ def triangle_count(G, start_list=None):
         (edge weights are not used in this algorithm).
         The current implementation only supports undirected graphs.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     start_list : list or cudf.Series
         list of vertices for triangle count. if None the entire set of vertices
         in the graph is processed
diff --git a/python/cugraph/cugraph/components/connectivity.py b/python/cugraph/cugraph/components/connectivity.py
index 45dba37d2ce..00bf443278f 100644
--- a/python/cugraph/cugraph/components/connectivity.py
+++ b/python/cugraph/cugraph/components/connectivity.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -115,6 +115,11 @@ def weakly_connected_components(G, directed=None, connection=None, return_labels
         The adjacency list will be computed if not already present. The number
         of vertices should fit into a 32b int.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     directed : bool, optional (default=None)
 
         NOTE
@@ -224,6 +229,11 @@ def strongly_connected_components(
         The adjacency list will be computed if not already present.  The number
         of vertices should fit into a 32b int.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     directed : bool, optional (default=True)
 
         NOTE
@@ -330,6 +340,11 @@ def connected_components(G, directed=None, connection="weak", return_labels=None
         The adjacency list will be computed if not already present.  The number
         of vertices should fit into a 32b int.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     directed : bool, optional (default=True)
 
         NOTE
diff --git a/python/cugraph/cugraph/cores/core_number.py b/python/cugraph/cugraph/cores/core_number.py
index 3e6cbe0d96f..0b411c2eed2 100644
--- a/python/cugraph/cugraph/cores/core_number.py
+++ b/python/cugraph/cugraph/cores/core_number.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -37,6 +37,11 @@ def core_number(G, degree_type="bidirectional"):
         of the core numbers.
         The current implementation only supports undirected graphs.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     degree_type: str, (default="bidirectional")
         This option determines if the core number computation should be based
         on input, output, or both directed edges, with valid values being
diff --git a/python/cugraph/cugraph/cores/k_core.py b/python/cugraph/cugraph/cores/k_core.py
index 3dbc1cfa377..6c8e20b0ea8 100644
--- a/python/cugraph/cugraph/cores/k_core.py
+++ b/python/cugraph/cugraph/cores/k_core.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -55,6 +55,11 @@ def k_core(G, k=None, core_number=None, degree_type="bidirectional"):
         weights, they don't participate in the calculation of the k-core.
         The current implementation only supports undirected graphs.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     k : int, optional (default=None)
         Order of the core. This value must not be negative. If set to None, the
         main core is returned.
diff --git a/python/cugraph/cugraph/link_analysis/pagerank.py b/python/cugraph/cugraph/link_analysis/pagerank.py
index ef0705c6be9..40ae14e76dd 100644
--- a/python/cugraph/cugraph/link_analysis/pagerank.py
+++ b/python/cugraph/cugraph/link_analysis/pagerank.py
@@ -107,6 +107,11 @@ def pagerank(
         as an edge list.
         The transposed adjacency list will be computed if not already present.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     alpha : float, optional (default=0.85)
         The damping factor alpha represents the probability to follow an
         outgoing edge, standard value is 0.85.
diff --git a/python/cugraph/cugraph/link_prediction/cosine.py b/python/cugraph/cugraph/link_prediction/cosine.py
index 9dce0e96f8c..71ef88b78b4 100644
--- a/python/cugraph/cugraph/link_prediction/cosine.py
+++ b/python/cugraph/cugraph/link_prediction/cosine.py
@@ -190,6 +190,11 @@ def cosine_coefficient(
 
         This implementation only supports undirected, non-multi Graphs.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None)
         A GPU dataframe consisting of two columns representing pairs of
         vertices or iterable of 2-tuples (u, v) where u and v are nodes in
diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py
index 214d92a1be5..77eb6447e85 100644
--- a/python/cugraph/cugraph/link_prediction/jaccard.py
+++ b/python/cugraph/cugraph/link_prediction/jaccard.py
@@ -190,6 +190,11 @@ def jaccard_coefficient(
 
         This implementation only supports undirected, non-multi Graphs.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None)
         A GPU dataframe consisting of two columns representing pairs of
         vertices or iterable of 2-tuples (u, v) where u and v are nodes in
diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py
index 52697d6b552..47c7765fb0f 100644
--- a/python/cugraph/cugraph/link_prediction/overlap.py
+++ b/python/cugraph/cugraph/link_prediction/overlap.py
@@ -75,6 +75,11 @@ def overlap_coefficient(
 
         This implementation only supports undirected, non-multi edge Graph.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None)
         A GPU dataframe consisting of two columns representing pairs of
         vertices or iterable of 2-tuples (u, v) where u and v are nodes in
diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py
index 8030234993b..d8327bf25a9 100644
--- a/python/cugraph/cugraph/link_prediction/sorensen.py
+++ b/python/cugraph/cugraph/link_prediction/sorensen.py
@@ -205,6 +205,11 @@ def sorensen_coefficient(
 
         This implementation only supports undirected, non-multi Graphs.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None)
         A GPU dataframe consisting of two columns representing pairs of
         vertices or iterable of 2-tuples (u, v) where u and v are nodes in
diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py
index 71fc2969f86..eb91bfec82b 100644
--- a/python/cugraph/cugraph/sampling/node2vec.py
+++ b/python/cugraph/cugraph/sampling/node2vec.py
@@ -60,6 +60,11 @@ def node2vec(G, start_vertices, max_depth=1, compress_result=True, p=1.0, q=1.0)
         The graph can be either directed or undirected.
         Weights in the graph are ignored.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     start_vertices: int or list or cudf.Series or cudf.DataFrame
         A single node or a list or a cudf.Series of nodes from which to run
         the random walks. In case of multi-column vertices it should be
diff --git a/python/cugraph/cugraph/sampling/random_walks.py b/python/cugraph/cugraph/sampling/random_walks.py
index 1bd7394164f..1c56dbbe324 100644
--- a/python/cugraph/cugraph/sampling/random_walks.py
+++ b/python/cugraph/cugraph/sampling/random_walks.py
@@ -72,6 +72,11 @@ def random_walks(
     G : cuGraph.Graph or networkx.Graph
         The graph can be either directed or undirected.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     random_walks_type : str, optional (default='uniform')
         Type of random walks: 'uniform', 'biased', 'node2vec'.
         Only 'uniform' random walks is currently supported
diff --git a/python/cugraph/cugraph/structure/graph_classes.py b/python/cugraph/cugraph/structure/graph_classes.py
index 84234f7e904..90f809fa6c1 100644
--- a/python/cugraph/cugraph/structure/graph_classes.py
+++ b/python/cugraph/cugraph/structure/graph_classes.py
@@ -115,7 +115,6 @@ def from_cudf_edgelist(
         edge_type=None,
         renumber=True,
         store_transposed=False,
-        legacy_renum_only=False,
         symmetrize=None,
     ):
         """
@@ -168,13 +167,6 @@ def from_cudf_edgelist(
             If True, stores the transpose of the adjacency matrix.  Required
             for certain algorithms.
 
-        legacy_renum_only : bool, optional (default=False)
-            If True, skips the C++ renumbering step.  Must be true for
-            pylibcugraph algorithms.  Must be false for algorithms
-            not yet converted to the pylibcugraph C API.
-
-            This parameter is deprecated and will be removed.
-
         symmetrize: bool, optional (default=None)
             If True, symmetrize the edge list for an undirected graph. Setting
             this flag to True for a directed graph returns an error. The default
@@ -210,7 +202,6 @@ def from_cudf_edgelist(
             edge_type=edge_type,
             renumber=renumber,
             store_transposed=store_transposed,
-            legacy_renum_only=legacy_renum_only,
             symmetrize=symmetrize,
         )
 
@@ -306,7 +297,6 @@ def from_dask_cudf_edgelist(
         edge_type=None,
         renumber=True,
         store_transposed=False,
-        legacy_renum_only=False,
     ):
         """
         Initializes the distributed graph from the dask_cudf.DataFrame
@@ -353,13 +343,6 @@ def from_dask_cudf_edgelist(
             If True, stores the transpose of the adjacency matrix.  Required
             for certain algorithms.
 
-        legacy_renum_only : bool, optional (default=False)
-            If True, skips the C++ renumbering step.  Must be true for
-            pylibcugraph algorithms.  Must be false for algorithms
-            not yet converted to the pylibcugraph C API.
-
-            This parameter is deprecated and will be removed.
-
         """
 
         if self._Impl is None:
@@ -378,7 +361,6 @@ def from_dask_cudf_edgelist(
             edge_type=edge_type,
             renumber=renumber,
             store_transposed=store_transposed,
-            legacy_renum_only=legacy_renum_only,
         )
 
     # Move to Compat Module
@@ -869,7 +851,6 @@ def from_cudf_edgelist(
         edge_attr=None,
         renumber=True,
         store_transposed=False,
-        legacy_renum_only=False,
     ):
         """
         Initialize a graph from the edge list. It is an error to call this
@@ -909,13 +890,6 @@ def from_cudf_edgelist(
             If True, stores the transpose of the adjacency matrix.  Required
             for certain algorithms.
 
-        legacy_renum_only : bool, optional (default=False)
-            If True, skips the C++ renumbering step.  Must be true for
-            pylibcugraph algorithms.  Must be false for algorithms
-            not yet converted to the pylibcugraph C API.
-
-            This parameter is deprecated and will be removed.
-
         Examples
         --------
         >>> df = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
@@ -945,7 +919,6 @@ def from_dask_cudf_edgelist(
         edge_attr=None,
         renumber=True,
         store_transposed=False,
-        legacy_renum_only=False,
     ):
         """
         Initializes the distributed graph from the dask_cudf.DataFrame
@@ -980,12 +953,6 @@ def from_dask_cudf_edgelist(
             If True, stores the transpose of the adjacency matrix.  Required
             for certain algorithms.
 
-        legacy_renum_only : bool, optional (default=False)
-            If True, skips the C++ renumbering step.  Must be true for
-            pylibcugraph algorithms.  Must be false for algorithms
-            not yet converted to the pylibcugraph C API.
-
-            This parameter is deprecated and will be removed.
         """
         raise TypeError("Distributed N-partite graph not supported")
 
diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
index 83dad234287..ced72a6bbe2 100644
--- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
+++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
@@ -14,6 +14,7 @@
 import gc
 from typing import Union, Iterable
 import warnings
+from typing import Tuple
 
 import cudf
 import cupy as cp
@@ -31,6 +32,7 @@
     degrees as pylibcugraph_degrees,
     in_degrees as pylibcugraph_in_degrees,
     out_degrees as pylibcugraph_out_degrees,
+    decompress_to_edgelist as pylibcugraph_decompress_to_edgelist,
 )
 
 from cugraph.structure.number_map import NumberMap
@@ -172,7 +174,6 @@ def __from_edgelist(
         edge_type=None,
         renumber=True,
         store_transposed=False,
-        legacy_renum_only=False,
         symmetrize=None,
     ):
         if not isinstance(input_ddf, dask_cudf.DataFrame):
@@ -333,9 +334,7 @@ def __from_edgelist(
         # the edgelist_df and not do any renumbering.
         # C++ renumbering is enabled by default for algorithms that
         # support it (but only called if renumbering is on)
-        self.compute_renumber_edge_list(
-            transposed=store_transposed, legacy_renum_only=legacy_renum_only
-        )
+        self.compute_renumber_edge_list(transposed=store_transposed)
 
         if renumber is False:
             self.properties.renumbered = False
@@ -979,6 +978,84 @@ def convert_to_cudf(cp_arrays):
 
         return ddf
 
+    def decompress_to_edgelist(
+        self, return_unrenumbered_edgelist: bool = True
+    ) -> dask_cudf.DataFrame:
+        """
+        Extract a the edgelist from a graph.
+
+        Parameters
+        ----------
+        return_unrenumbered_edgelist : bool (default=True)
+                                    Flag determining whether to return the original
+                                    input edgelist if 'True' or the renumbered one
+                                    of 'False' and the edgelist was renumbered.
+
+        Returns
+        -------
+        df : dask_cudf.cudf.DataFrame
+            Distributed GPU data frame containing all induced sources identifiers,
+            destination identifiers, and if applicable edge weights, edge ids and
+            edge types
+        """
+
+        # Initialize dask client
+        client = default_client()
+
+        do_expensive_check = False
+
+        def _call_decompress_to_edgelist(
+            sID: bytes,
+            mg_graph_x,
+            do_expensive_check: bool,
+        ) -> Tuple[cp.ndarray, cp.ndarray, cp.ndarray, cp.ndarray]:
+            return pylibcugraph_decompress_to_edgelist(
+                resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
+                graph=mg_graph_x,
+                do_expensive_check=do_expensive_check,
+            )
+
+        result = [
+            client.submit(
+                _call_decompress_to_edgelist,
+                Comms.get_session_id(),
+                self._plc_graph[w],
+                do_expensive_check,
+            )
+            for w in Comms.get_workers()
+        ]
+        wait(result)
+
+        def convert_to_cudf(cp_arrays: cp.ndarray) -> cudf.DataFrame:
+            cp_src, cp_dst, cp_weight, cp_edge_ids, cp_edge_type_ids = cp_arrays
+
+            df = cudf.DataFrame()
+            df["src"] = cp_src
+            df["dst"] = cp_dst
+            if cp_weight is not None:
+                df["weight"] = cp_weight
+            if cp_edge_ids is not None:
+                df["edge_ids"] = cp_edge_ids
+            if cp_edge_type_ids is not None:
+                df["edge_type_ids"] = cp_edge_type_ids
+
+            return df
+
+        cudf_result = [
+            client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result
+        ]
+
+        wait(cudf_result)
+
+        ddf = dask_cudf.from_delayed(cudf_result).persist()
+        wait(ddf)
+
+        if self.properties.renumbered and return_unrenumbered_edgelist:
+            ddf = self.renumber_map.unrenumber(ddf, "src")
+            ddf = self.renumber_map.unrenumber(ddf, "dst")
+
+        return ddf
+
     def select_random_vertices(
         self, random_state: int = None, num_vertices: int = None
     ) -> Union[dask_cudf.Series, dask_cudf.DataFrame]:
@@ -1214,7 +1291,7 @@ def neighbors(self, n):
         ddf = self.edgelist.edgelist_df
         return ddf[ddf["src"] == n]["dst"].reset_index(drop=True)
 
-    def compute_renumber_edge_list(self, transposed=False, legacy_renum_only=False):
+    def compute_renumber_edge_list(self, transposed=False):
         """
         Compute a renumbered edge list
         This function works in the MNMG pipeline and will transform
@@ -1237,20 +1314,9 @@ def compute_renumber_edge_list(self, transposed=False, legacy_renum_only=False):
             structure.  If False, renumber with the intent to make
             a CSR-like structure.  Defaults to False.
 
-        legacy_renum_only : (optional) bool
-            if True, The C++ renumbering will not be triggered.
-            This parameter is added for new algos following the
-            C/Pylibcugraph path
-
             This parameter is deprecated and will be removed.
         """
 
-        if legacy_renum_only:
-            warning_msg = (
-                "The parameter 'legacy_renum_only' is deprecated and will be removed."
-            )
-            warnings.warn(warning_msg, DeprecationWarning)
-
         if not self.properties.renumber:
             self.edgelist = self.EdgeList(self.input_df)
             self.renumber_map = None
@@ -1269,7 +1335,6 @@ def compute_renumber_edge_list(self, transposed=False, legacy_renum_only=False):
                 self.source_columns,
                 self.destination_columns,
                 store_transposed=transposed,
-                legacy_renum_only=legacy_renum_only,
             )
 
             self.edgelist = self.EdgeList(renumbered_ddf)
diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
index 858b114ebdc..4523b7f13b8 100644
--- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
+++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
@@ -14,6 +14,7 @@
 from cugraph.structure import graph_primtypes_wrapper
 from cugraph.structure.replicate_edgelist import replicate_cudf_dataframe
 from cugraph.structure.symmetrize import symmetrize as symmetrize_df
+from pylibcugraph import decompress_to_edgelist as pylibcugraph_decompress_to_edgelist
 from cugraph.structure.number_map import NumberMap
 import cugraph.dask.common.mg_utils as mg_utils
 import cudf
@@ -132,17 +133,9 @@ def __from_edgelist(
         edge_id=None,
         edge_type=None,
         renumber=True,
-        legacy_renum_only=False,
         store_transposed=False,
         symmetrize=None,
     ):
-        if legacy_renum_only:
-            warning_msg = (
-                "The parameter 'legacy_renum_only' is deprecated and will be removed."
-            )
-            warnings.warn(
-                warning_msg,
-            )
 
         if self.properties.directed and symmetrize:
             raise ValueError(
@@ -266,11 +259,7 @@ def __from_edgelist(
         if renumber:
             # FIXME: Should SG do lazy evaluation like MG?
             elist, renumber_map = NumberMap.renumber(
-                elist,
-                source,
-                destination,
-                store_transposed=False,
-                legacy_renum_only=legacy_renum_only,
+                elist, source, destination, store_transposed=False
             )
             source = renumber_map.renumbered_src_col_name
             destination = renumber_map.renumbered_dst_col_name
@@ -312,6 +301,8 @@ def __from_edgelist(
 
         # FIXME: if the user calls self.edgelist.edgelist_df after creating a
         # symmetric graph, return the symmetric edgelist?
+        # FIXME: For better memory footprint, avoid storing this edgelist and instead
+        # call decompress_to_edgelist to extract the edgelist from the graph
         self.edgelist = simpleGraphImpl.EdgeList(
             elist[source], elist[destination], value_col
         )
@@ -804,6 +795,64 @@ def get_two_hop_neighbors(self, start_vertices=None):
 
         return df
 
+    def decompress_to_edgelist(
+        self, return_unrenumbered_edgelist: bool = True
+    ) -> cudf.DataFrame:
+        """
+        Extract a the edgelist from a graph.
+
+        Parameters
+        ----------
+        return_unrenumbered_edgelist : bool (default=True)
+            Flag determining whether to return the original input edgelist
+            if 'True' or the renumbered one of 'False' and the edgelist was
+            renumbered.
+
+        Returns
+        -------
+
+        df : cudf.DataFrame
+            GPU data frame containing all sources identifiers,
+            destination identifiers and if applicable edge weights, edge ids and
+            edge types
+
+        Examples
+        --------
+        >>> from cugraph.datasets import karate
+        >>> G = karate.get_graph(download=True)
+        >>> edgelist = G.decompress_to_edgelist()
+
+        """
+
+        do_expensive_check = False
+        (
+            source,
+            destination,
+            weight,
+            edge_ids,
+            edge_type_ids,
+        ) = pylibcugraph_decompress_to_edgelist(
+            resource_handle=ResourceHandle(),
+            graph=self._plc_graph,
+            do_expensive_check=do_expensive_check,
+        )
+
+        df = cudf.DataFrame()
+        df["src"] = source
+        df["dst"] = destination
+        if weight is not None:
+            df["weight"] = weight
+        if edge_ids is not None:
+            df["edge_ids"] = edge_ids
+        if edge_type_ids is not None:
+            df["edge_type_ids"] = edge_type_ids
+
+        if self.properties.renumbered and return_unrenumbered_edgelist:
+            df, _ = self.renumber_map.unrenumber(df, "src", get_column_names=True)
+            df, _ = self.renumber_map.unrenumber(df, "dst", get_column_names=True)
+
+        return df
+
     def select_random_vertices(
         self,
         random_state: int = None,
diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py
index b0118fee960..39738daff36 100644
--- a/python/cugraph/cugraph/structure/number_map.py
+++ b/python/cugraph/cugraph/structure/number_map.py
@@ -18,7 +18,6 @@
 import dask_cudf
 import numpy as np
 import cudf
-import warnings
 
 
 class NumberMap:
@@ -462,12 +461,7 @@ def from_internal_vertex_id(
 
     @staticmethod
     def renumber_and_segment(
-        df,
-        src_col_names,
-        dst_col_names,
-        preserve_order=False,
-        store_transposed=False,
-        legacy_renum_only=False,
+        df, src_col_names, dst_col_names, preserve_order=False, store_transposed=False
     ):
         """
         Given an input dataframe with its column names, this function returns the
@@ -475,11 +469,6 @@ def renumber_and_segment(
         to external vertex IDs. the parameter 'preserve_order' ensures that the order
         of the edges is preserved during renumbering.
         """
-        if legacy_renum_only:
-            warning_msg = (
-                "The parameter 'legacy_renum_only' is deprecated and will be removed."
-            )
-            warnings.warn(warning_msg, DeprecationWarning)
 
         renumbered = False
 
@@ -584,20 +573,10 @@ def renumber_and_segment(
 
     @staticmethod
     def renumber(
-        df,
-        src_col_names,
-        dst_col_names,
-        preserve_order=False,
-        store_transposed=False,
-        legacy_renum_only=False,
+        df, src_col_names, dst_col_names, preserve_order=False, store_transposed=False
     ):
         return NumberMap.renumber_and_segment(
-            df,
-            src_col_names,
-            dst_col_names,
-            preserve_order,
-            store_transposed,
-            legacy_renum_only,
+            df, src_col_names, dst_col_names, preserve_order, store_transposed
         )[0:2]
 
     def unrenumber(self, df, column_name, preserve_order=False, get_column_names=False):
diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py
index 34ee72e799b..c9fb73babb8 100644
--- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py
+++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py
@@ -17,6 +17,7 @@
 
 import pytest
 import networkx as nx
+import pandas as pd
 
 import cudf
 import cugraph
@@ -153,6 +154,54 @@ def networkx_call(M, benchmark_callable=None):
     return src, dst, coeff
 
 
+# FIXME: This compare is shared across several tests... it should be
+#        a general utility
+def compare(src1, dst1, val1, src2, dst2, val2):
+    #
+    #  We will do comparison computations by using dataframe
+    #  merge functions (essentially doing fast joins).  We
+    #  start by making two data frames
+    #
+    df1 = cudf.DataFrame()
+    df1["src1"] = src1
+    df1["dst1"] = dst1
+    if val1 is not None:
+        df1["val1"] = val1
+
+    df2 = cudf.DataFrame()
+    df2["src2"] = src2
+    df2["dst2"] = dst2
+    if val2 is not None:
+        df2["val2"] = val2
+
+    #
+    #  Check to see if all pairs in the original data frame
+    #  still exist in the new data frame.  If we join (merge)
+    #  the data frames where (src1[i]=src2[i]) and (dst1[i]=dst2[i])
+    #  then we should get exactly the same number of entries in
+    #  the data frame if we did not lose any data.
+    #
+    join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"])
+
+    if len(df1) != len(join):
+        join2 = df1.merge(
+            df2, how="left", left_on=["src1", "dst1"], right_on=["src2", "dst2"]
+        )
+        pd.set_option("display.max_rows", 500)
+        print("df1 = \n", df1.sort_values(["src1", "dst1"]))
+        print("df2 = \n", df2.sort_values(["src2", "dst2"]))
+        print(
+            "join2 = \n",
+            join2.sort_values(["src1", "dst1"])
+            .to_pandas()
+            .query("src2.isnull()", engine="python"),
+        )
+
+    assert len(df1) == len(join)
+
+    assert_series_equal(join["val1"], join["val2"], check_names=False)
+
+
 # =============================================================================
 # Pytest Fixtures
 # =============================================================================
@@ -415,7 +464,7 @@ def test_all_pairs_jaccard_with_topk():
     jaccard_results = (
         jaccard_results[jaccard_results["first"] != jaccard_results["second"]]
         .sort_values(["jaccard_coeff", "first", "second"], ascending=False)
-        .reset_index(drop=True)[:topk]
+        .reset_index(drop=True)
     )
 
     # Call all-pairs Jaccard
@@ -425,6 +474,37 @@ def test_all_pairs_jaccard_with_topk():
         .reset_index(drop=True)
     )
 
-    assert_frame_equal(
-        jaccard_results, all_pairs_jaccard_results, check_dtype=False, check_like=True
+    # 1. All pair similarity might return different top pairs k pairs
+    # which are still valid hence, ensure the pairs returned by all-pairs
+    # exists, and that any results better than the k-th result are included
+    # in the result
+
+    # FIXME: This problem could exist in overlap, cosine and sorensen,
+    #        consider replicating this code or making a share comparison
+    #        function
+    worst_coeff = all_pairs_jaccard_results["jaccard_coeff"].min()
+    better_than_k = jaccard_results[jaccard_results["jaccard_coeff"] > worst_coeff]
+
+    compare(
+        all_pairs_jaccard_results["first"],
+        all_pairs_jaccard_results["second"],
+        all_pairs_jaccard_results["jaccard_coeff"],
+        jaccard_results["first"],
+        jaccard_results["second"],
+        jaccard_results["jaccard_coeff"],
+    )
+
+    compare(
+        better_than_k["first"],
+        better_than_k["second"],
+        better_than_k["jaccard_coeff"],
+        all_pairs_jaccard_results["first"],
+        all_pairs_jaccard_results["second"],
+        all_pairs_jaccard_results["jaccard_coeff"],
+    )
+
+    # 2. Ensure the coefficient scores are still the highest
+    assert_series_equal(
+        all_pairs_jaccard_results["jaccard_coeff"],
+        jaccard_results["jaccard_coeff"][:topk],
     )
diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py
index 4c30f149ea5..5369398fa16 100644
--- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py
+++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py
@@ -157,6 +157,8 @@ def networkx_call(M, benchmark_callable=None):
     return src, dst, coeff
 
 
+# FIXME: This compare is shared across several tests... it should be
+#        a general utility
 def compare(src1, dst1, val1, src2, dst2, val2):
     #
     #  We will do comparison computations by using dataframe
@@ -200,6 +202,8 @@ def compare(src1, dst1, val1, src2, dst2, val2):
 
     assert len(df1) == len(join)
 
+    assert_series_equal(join["val1"], join["val2"], check_names=False)
+
 
 # =============================================================================
 # Pytest Fixtures
@@ -456,7 +460,7 @@ def test_all_pairs_sorensen_with_topk():
     sorensen_results = (
         sorensen_results[sorensen_results["first"] != sorensen_results["second"]]
         .sort_values(["sorensen_coeff", "first", "second"], ascending=False)
-        .reset_index(drop=True)[:topk]
+        .reset_index(drop=True)
     )
 
     # Call all-pairs sorensen
@@ -468,7 +472,14 @@ def test_all_pairs_sorensen_with_topk():
 
     # 1. All pair similarity might return different top pairs k pairs
     # which are still valid hence, ensure the pairs returned by all-pairs
-    # exists.
+    # exists, and that any results better than the k-th result are included
+    # in the result
+
+    # FIXME: This problem could exist in overlap, cosine and jaccard,
+    #        consider replicating this code or making a share comparison
+    #        function
+    worst_coeff = all_pairs_sorensen_results["sorensen_coeff"].min()
+    better_than_k = sorensen_results[sorensen_results["sorensen_coeff"] > worst_coeff]
 
     compare(
         all_pairs_sorensen_results["first"],
@@ -479,6 +490,15 @@ def test_all_pairs_sorensen_with_topk():
         sorensen_results["sorensen_coeff"],
     )
 
+    compare(
+        better_than_k["first"],
+        better_than_k["second"],
+        better_than_k["sorensen_coeff"],
+        all_pairs_sorensen_results["first"],
+        all_pairs_sorensen_results["second"],
+        all_pairs_sorensen_results["sorensen_coeff"],
+    )
+
     # 2. Ensure the coefficient scores are still the highest
     assert_series_equal(
         all_pairs_sorensen_results["sorensen_coeff"],
diff --git a/python/cugraph/cugraph/tests/structure/test_graph.py b/python/cugraph/cugraph/tests/structure/test_graph.py
index b3e517100e1..6fcfef726b1 100644
--- a/python/cugraph/cugraph/tests/structure/test_graph.py
+++ b/python/cugraph/cugraph/tests/structure/test_graph.py
@@ -179,6 +179,58 @@ def test_add_edge_list_to_adj_list(graph_file):
     assert values_cu is None
 
 
+@pytest.mark.sg
+@pytest.mark.parametrize("graph_file", utils.DATASETS)
+@pytest.mark.parametrize("is_directed", [True, False])
+@pytest.mark.parametrize("renumber", [True, False])
+def test_decompress_to_edgelist(graph_file, is_directed, renumber):
+    input_df = utils.read_csv_file(graph_file)
+    input_df = input_df.rename(columns={"0": "src", "1": "dst", "2": "weight"})
+
+    G = cugraph.Graph(directed=is_directed)
+    input_df_ = cudf.DataFrame()
+    if renumber:
+        input_df_["src_0"] = cudf.Series(input_df["src"])
+        input_df_["dst_0"] = cudf.Series(input_df["dst"])
+        input_df_["weight"] = cudf.Series(input_df["weight"])
+        input_df_["src_1"] = input_df_["src_0"] + 1000
+        input_df_["dst_1"] = input_df_["dst_0"] + 1000
+
+        input_df = input_df_
+        source = ["src_0", "src_1"]
+        destination = ["dst_0", "dst_1"]
+    else:
+        source = "src"
+        destination = "dst"
+
+    G.from_cudf_edgelist(
+        input_df, source=source, destination=destination, weight="weight", renumber=True
+    )
+
+    extracted_df = G.decompress_to_edgelist(return_unrenumbered_edgelist=True)
+
+    if renumber:
+        extracted_df = extracted_df.rename(
+            columns={
+                "0_src": "src_0",
+                "1_src": "src_1",
+                "0_dst": "dst_0",
+                "1_dst": "dst_1",
+            }
+        )
+        extracted_df = extracted_df.sort_values(
+            ["src_0", "src_1", "dst_0", "dst_1"]
+        ).reset_index(drop=True)
+        input_df = input_df.sort_values(
+            ["src_0", "src_1", "dst_0", "dst_1"]
+        ).reset_index(drop=True)
+    else:
+        extracted_df = extracted_df.sort_values(["src", "dst"]).reset_index(drop=True)
+        input_df = input_df.sort_values(["src", "dst"]).reset_index(drop=True)
+
+    assert_frame_equal(input_df, extracted_df, check_dtype=False, check_like=True)
+
+
 # Test
 @pytest.mark.sg
 @pytest.mark.parametrize("graph_file", utils.DATASETS)
diff --git a/python/cugraph/cugraph/tests/structure/test_graph_mg.py b/python/cugraph/cugraph/tests/structure/test_graph_mg.py
index f2cc1583f93..e5eeb0f653b 100644
--- a/python/cugraph/cugraph/tests/structure/test_graph_mg.py
+++ b/python/cugraph/cugraph/tests/structure/test_graph_mg.py
@@ -420,3 +420,57 @@ def test_graph_creation_properties(dask_client, graph_file, directed, renumber):
     assert sG.number_of_nodes() == mG.number_of_nodes()
     assert sG.number_of_edges() == mG.number_of_edges()
     assert_frame_equal(sG_edgelist_view, mG_edgelist_view, check_dtype=False)
+
+
+@pytest.mark.parametrize("directed", [True, False])
+@pytest.mark.parametrize("renumber", [True, False])
+@pytest.mark.parametrize("graph_file", datasets)
+def test_decompress_to_edgelist(dask_client, graph_file, directed, renumber):
+    input_df = utils.read_csv_file(graph_file)
+    input_df = input_df.rename(columns={"0": "src", "1": "dst", "2": "weight"})
+
+    G = cugraph.Graph(directed=directed)
+    input_df_ = cudf.DataFrame()
+    if renumber:
+        input_df_["src_0"] = cudf.Series(input_df["src"])
+        input_df_["dst_0"] = cudf.Series(input_df["dst"])
+        input_df_["weight"] = cudf.Series(input_df["weight"])
+        input_df_["src_1"] = input_df_["src_0"] + 1000
+        input_df_["dst_1"] = input_df_["dst_0"] + 1000
+
+        input_df = input_df_
+        source = ["src_0", "src_1"]
+        destination = ["dst_0", "dst_1"]
+    else:
+        source = "src"
+        destination = "dst"
+    num_workers = len(Comms.get_workers())
+
+    input_ddf = dask_cudf.from_cudf(input_df, npartitions=num_workers)
+
+    G = cugraph.Graph(directed=True)
+    G.from_dask_cudf_edgelist(
+        input_ddf, source=source, destination=destination, weight="weight"
+    )
+
+    extracted_df = (
+        G.decompress_to_edgelist(return_unrenumbered_edgelist=True)
+        .compute()
+        .reset_index(drop=True)
+    )
+
+    if renumber:
+        extracted_df = extracted_df.rename(
+            columns={
+                "0_src": "src_0",
+                "1_src": "src_1",
+                "0_dst": "dst_0",
+                "1_dst": "dst_1",
+            }
+        )
+        extracted_df = extracted_df.sort_values(
+            ["src_0", "src_1", "dst_0", "dst_1"]
+        ).reset_index(drop=True)
+        input_df = input_df.sort_values(
+            ["src_0", "src_1", "dst_0", "dst_1"]
+        ).reset_index(drop=True)
diff --git a/python/cugraph/cugraph/traversal/bfs.py b/python/cugraph/cugraph/traversal/bfs.py
index cad96947f8b..2335b1d390c 100644
--- a/python/cugraph/cugraph/traversal/bfs.py
+++ b/python/cugraph/cugraph/traversal/bfs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -135,6 +135,11 @@ def bfs(
         information. Edge weights, if present, should be single or double
         precision floating point values.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     start : Integer or list, optional (default=None)
         The id of the graph vertex from which the traversal begins, or
         if a list, the vertex from which the traversal begins in each
@@ -265,6 +270,11 @@ def bfs_edges(G, source, reverse=False, depth_limit=None, sort_neighbors=None):
         information. Edge weights, if present, should be single or double
         precision floating point values.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     source : Integer
         The starting vertex index
 
diff --git a/python/cugraph/cugraph/traversal/sssp.py b/python/cugraph/cugraph/traversal/sssp.py
index bb98b5a9a29..ad3f3c716d1 100644
--- a/python/cugraph/cugraph/traversal/sssp.py
+++ b/python/cugraph/cugraph/traversal/sssp.py
@@ -158,6 +158,12 @@ def sssp(
         weights, if present, should be single or double precision floating
         point values.
         The current implementation only supports weighted graphs.
+
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     source : int
         Index of the source vertex.
     cutoff : double, optional (default=None)
@@ -323,6 +329,11 @@ def shortest_path_length(G, source, target=None):
         cuGraph graph descriptor with connectivity information. Edge weights,
         if present, should be single or double precision floating point values.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     source : Dependant on graph type. Index of the source vertex.
 
     If graph is an instance of cuGraph.Graph or CuPy sparse COO matrix:
diff --git a/python/cugraph/cugraph/tree/minimum_spanning_tree.py b/python/cugraph/cugraph/tree/minimum_spanning_tree.py
index b297042f199..342ee0b77fe 100644
--- a/python/cugraph/cugraph/tree/minimum_spanning_tree.py
+++ b/python/cugraph/cugraph/tree/minimum_spanning_tree.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -72,6 +72,11 @@ def minimum_spanning_tree(G, weight=None, algorithm="boruvka", ignore_nan=False)
     G : cuGraph.Graph or networkx.Graph
         cuGraph graph descriptor with connectivity information.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     weight : string
         default to the weights in the graph, if the graph edges do not have a
         weight attribute a default weight of 1 will be used.
@@ -115,6 +120,11 @@ def maximum_spanning_tree(G, weight=None, algorithm="boruvka", ignore_nan=False)
     G : cuGraph.Graph or networkx.Graph
         cuGraph graph descriptor with connectivity information.
 
+        .. deprecated:: 24.12
+           Accepting a ``networkx.Graph`` is deprecated and will be removed in a
+           future version.  For ``networkx.Graph`` use networkx directly with
+           the ``nx-cugraph`` backend. See:  https://rapids.ai/nx-cugraph/
+
     weight : string
         default to the weights in the graph, if the graph edges do not have a
         weight attribute a default weight of 1 will be used.
diff --git a/python/cugraph/cugraph/utilities/utils.py b/python/cugraph/cugraph/utilities/utils.py
index 69616f26857..5bad68a79e2 100644
--- a/python/cugraph/cugraph/utilities/utils.py
+++ b/python/cugraph/cugraph/utilities/utils.py
@@ -23,6 +23,7 @@
 from cuda.cudart import cudaDeviceAttr
 from rmm._cuda.gpu import getDeviceAttribute
 
+from warnings import warn
 
 # optional dependencies
 try:
@@ -334,6 +335,12 @@ def ensure_cugraph_obj_for_nx(
 
     input_type = type(obj)
     if is_nx_graph_type(input_type):
+        warn(
+            "Support for accepting and returning NetworkX objects is "
+            "deprecated. Please use NetworkX with the nx-cugraph backend",
+            DeprecationWarning,
+            2,
+        )
         return (
             convert_from_nx(
                 obj,
diff --git a/python/cugraph/pytest.ini b/python/cugraph/pytest.ini
index 5cbc4631664..bf6e6bdd802 100644
--- a/python/cugraph/pytest.ini
+++ b/python/cugraph/pytest.ini
@@ -71,3 +71,4 @@ filterwarnings =
           # Called via dask. Not obviously addressable in cugraph.
           ignore:The behavior of array concatenation with empty entries is deprecated:FutureWarning
           ignore:This method is deprecated and will no longer be supported. The symmetrization:FutureWarning
+          ignore:Support for accepting and returning NetworkX objects is deprecated. Please use NetworkX with the nx-cugraph backend:DeprecationWarning
diff --git a/python/nx-cugraph/.flake8 b/python/nx-cugraph/.flake8
deleted file mode 100644
index cdda8d1080f..00000000000
--- a/python/nx-cugraph/.flake8
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-[flake8]
-max-line-length = 88
-inline-quotes = "
-extend-ignore =
-    B020,
-    E203,
-    SIM105,
-    SIM401,
-# E203 whitespace before ':' (to be compatible with black)
-per-file-ignores =
-    nx_cugraph/tests/*.py:T201,
-    __init__.py:F401,F403,
-    _nx_cugraph/__init__.py:E501,
diff --git a/python/nx-cugraph/LICENSE b/python/nx-cugraph/LICENSE
deleted file mode 120000
index 30cff7403da..00000000000
--- a/python/nx-cugraph/LICENSE
+++ /dev/null
@@ -1 +0,0 @@
-../../LICENSE
\ No newline at end of file
diff --git a/python/nx-cugraph/Makefile b/python/nx-cugraph/Makefile
deleted file mode 100644
index 6500d834ee7..00000000000
--- a/python/nx-cugraph/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-SHELL= /bin/bash
-
-.PHONY: all
-all: plugin-info lint readme
-
-.PHONY: lint
-lint:
-	git ls-files | xargs pre-commit run --config lint.yaml --files || true
-
-.PHONY: lint-update
-lint-update:
-	pre-commit autoupdate --config lint.yaml
-
-.PHONY: plugin-info
-plugin-info:
-	python _nx_cugraph/__init__.py
-
-objects.inv:
-	wget https://networkx.org/documentation/stable/objects.inv
-
-.PHONY: readme
-readme: objects.inv
-	python scripts/update_readme.py README.md objects.inv
diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md
deleted file mode 100644
index 8cc3a5d90df..00000000000
--- a/python/nx-cugraph/README.md
+++ /dev/null
@@ -1,278 +0,0 @@
-# nx-cugraph
-
-## Description
-[RAPIDS](https://rapids.ai) nx-cugraph is a [backend to NetworkX](https://networkx.org/documentation/stable/backends.html)
-to run supported algorithms with GPU acceleration.
-
-## System Requirements
-
-nx-cugraph requires the following:
- * NVIDIA GPU, Volta architecture or later, with [compute capability](https://developer.nvidia.com/cuda-gpus) 7.0+
- * CUDA 11.2, 11.4, 11.5, 11.8, 12.0, 12.2, or 12.5
- * Python version 3.10, 3.11, or 3.12
- * NetworkX >= version 3.0 (version 3.4 or higher recommended)
-
-More details about system requirements can be found in the [RAPIDS System Requirements documentation](https://docs.rapids.ai/install#system-req).
-
-## Installation
-
-nx-cugraph can be installed using either conda or pip.
-
-### conda
-#### latest nightly version
-```
-conda install -c rapidsai-nightly -c conda-forge -c nvidia nx-cugraph
-```
-#### latest stable version
-```
-conda install -c rapidsai -c conda-forge -c nvidia nx-cugraph
-```
-### pip
-#### latest nightly version
-```
-python -m pip install nx-cugraph-cu11 --extra-index-url https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
-```
-#### latest stable version
-```
-python -m pip install nx-cugraph-cu11 --extra-index-url https://pypi.nvidia.com
-```
-Notes:
- * The pip example above installs for CUDA 11. To install for CUDA 12, replace `-cu11` with `-cu12`
- * Additional information relevant to installing any RAPIDS package can be found [here](https://rapids.ai/#quick-start).
-
-## Enabling nx-cugraph
-
-NetworkX will use nx-cugraph as the graph analytics backend if any of the
-following are used:
-
-### `NX_CUGRAPH_AUTOCONFIG` environment variable.
-By setting `NX_CUGRAPH_AUTOCONFIG=True`, NetworkX will automatically dispatch algorithm calls to nx-cugraph (if the backend is supported). This allows users to GPU accelerate their code with zero code change.
-
-Read more on [Networkx Backends and How They Work](https://networkx.org/documentation/stable/reference/backends.html).
-
-Example:
-```
-bash> NX_CUGRAPH_AUTOCONFIG=True python my_networkx_script.py
-```
-
-### `backend=` keyword argument
-To explicitly specify a particular backend for an API, use the `backend=`
-keyword argument. This argument takes precedence over the
-`NX_CUGRAPH_AUTOCONFIG` environment variable. This requires anyone
-running code that uses the `backend=` keyword argument to have the specified
-backend installed.
-
-Example:
-```
-nx.betweenness_centrality(cit_patents_graph, k=k, backend="cugraph")
-```
-
-### Type-based dispatching
-
-NetworkX also supports automatically dispatching to backends associated with
-specific graph types. Like the `backend=` keyword argument example above, this
-requires the user to write code for a specific backend, and therefore requires
-the backend to be installed, but has the advantage of ensuring a particular
-behavior without the potential for runtime conversions.
-
-To use type-based dispatching with nx-cugraph, the user must import the backend
-directly in their code to access the utilities provided to create a Graph
-instance specifically for the nx-cugraph backend.
-
-Example:
-```
-import networkx as nx
-import nx_cugraph as nxcg
-
-G = nx.Graph()
-...
-nxcg_G = nxcg.from_networkx(G)             # conversion happens once here
-nx.betweenness_centrality(nxcg_G, k=1000)  # nxcg Graph type causes cugraph backend
-                                           # to be used, no conversion necessary
-```
-
-## Supported Algorithms
-
-The nx-cugraph backend to NetworkX connects
-[pylibcugraph](../../readme_pages/pylibcugraph.md) (cuGraph's low-level python
-interface to its CUDA-based graph analytics library) and
-[CuPy](https://cupy.dev/) (a GPU-accelerated array library) to NetworkX's
-familiar and easy-to-use API.
-
-Below is the list of algorithms that are currently supported in nx-cugraph.
-
-### [Algorithms](https://networkx.org/documentation/latest/reference/algorithms/index.html)
-
-<pre>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/bipartite.html#module-networkx.algorithms.bipartite">bipartite</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/bipartite.html#module-networkx.algorithms.bipartite.generators">generators</a>
-     └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.bipartite.generators.complete_bipartite_graph.html#networkx.algorithms.bipartite.generators.complete_bipartite_graph">complete_bipartite_graph</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/centrality.html#module-networkx.algorithms.centrality">centrality</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/centrality.html#networkx-algorithms-centrality-betweenness">betweenness</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.betweenness_centrality.html#networkx.algorithms.centrality.betweenness_centrality">betweenness_centrality</a>
- │   └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.edge_betweenness_centrality.html#networkx.algorithms.centrality.edge_betweenness_centrality">edge_betweenness_centrality</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/centrality.html#networkx-algorithms-centrality-degree-alg">degree_alg</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.degree_centrality.html#networkx.algorithms.centrality.degree_centrality">degree_centrality</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.in_degree_centrality.html#networkx.algorithms.centrality.in_degree_centrality">in_degree_centrality</a>
- │   └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.out_degree_centrality.html#networkx.algorithms.centrality.out_degree_centrality">out_degree_centrality</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/centrality.html#networkx-algorithms-centrality-eigenvector">eigenvector</a>
- │   └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.eigenvector_centrality.html#networkx.algorithms.centrality.eigenvector_centrality">eigenvector_centrality</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/centrality.html#networkx-algorithms-centrality-katz">katz</a>
-     └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.katz_centrality.html#networkx.algorithms.centrality.katz_centrality">katz_centrality</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/clustering.html#module-networkx.algorithms.cluster">cluster</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.cluster.average_clustering.html#networkx.algorithms.cluster.average_clustering">average_clustering</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.cluster.clustering.html#networkx.algorithms.cluster.clustering">clustering</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.cluster.transitivity.html#networkx.algorithms.cluster.transitivity">transitivity</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.cluster.triangles.html#networkx.algorithms.cluster.triangles">triangles</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/community.html#module-networkx.algorithms.community">community</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/community.html#module-networkx.algorithms.community.louvain">louvain</a>
-     └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.louvain.louvain_communities.html#networkx.algorithms.community.louvain.louvain_communities">louvain_communities</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/component.html#module-networkx.algorithms.components">components</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/component.html#networkx-algorithms-components-connected">connected</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.connected_components.html#networkx.algorithms.components.connected_components">connected_components</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.is_connected.html#networkx.algorithms.components.is_connected">is_connected</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.node_connected_component.html#networkx.algorithms.components.node_connected_component">node_connected_component</a>
- │   └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.number_connected_components.html#networkx.algorithms.components.number_connected_components">number_connected_components</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/component.html#networkx-algorithms-components-weakly-connected">weakly_connected</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.is_weakly_connected.html#networkx.algorithms.components.is_weakly_connected">is_weakly_connected</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.number_weakly_connected_components.html#networkx.algorithms.components.number_weakly_connected_components">number_weakly_connected_components</a>
-     └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.weakly_connected_components.html#networkx.algorithms.components.weakly_connected_components">weakly_connected_components</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/core.html#module-networkx.algorithms.core">core</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.core.core_number.html#networkx.algorithms.core.core_number">core_number</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.core.k_truss.html#networkx.algorithms.core.k_truss">k_truss</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/dag.html#module-networkx.algorithms.dag">dag</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.dag.ancestors.html#networkx.algorithms.dag.ancestors">ancestors</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.dag.descendants.html#networkx.algorithms.dag.descendants">descendants</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/isolates.html#module-networkx.algorithms.isolate">isolate</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.isolate.is_isolate.html#networkx.algorithms.isolate.is_isolate">is_isolate</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.isolate.isolates.html#networkx.algorithms.isolate.isolates">isolates</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.isolate.number_of_isolates.html#networkx.algorithms.isolate.number_of_isolates">number_of_isolates</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/link_analysis.html">link_analysis</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/link_analysis.html#module-networkx.algorithms.link_analysis.hits_alg">hits_alg</a>
- │   └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_analysis.hits_alg.hits.html#networkx.algorithms.link_analysis.hits_alg.hits">hits</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/link_analysis.html#module-networkx.algorithms.link_analysis.pagerank_alg">pagerank_alg</a>
-     └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank">pagerank</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/operators.html">operators</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/operators.html#module-networkx.algorithms.operators.unary">unary</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.operators.unary.complement.html#networkx.algorithms.operators.unary.complement">complement</a>
-     └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.operators.unary.reverse.html#networkx.algorithms.operators.unary.reverse">reverse</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/reciprocity.html#module-networkx.algorithms.reciprocity">reciprocity</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.reciprocity.overall_reciprocity.html#networkx.algorithms.reciprocity.overall_reciprocity">overall_reciprocity</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.reciprocity.reciprocity.html#networkx.algorithms.reciprocity.reciprocity">reciprocity</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/shortest_paths.html">shortest_paths</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/shortest_paths.html#module-networkx.algorithms.shortest_paths.generic">generic</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.generic.has_path.html#networkx.algorithms.shortest_paths.generic.has_path">has_path</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.generic.shortest_path.html#networkx.algorithms.shortest_paths.generic.shortest_path">shortest_path</a>
- │   └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.generic.shortest_path_length.html#networkx.algorithms.shortest_paths.generic.shortest_path_length">shortest_path_length</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/shortest_paths.html#module-networkx.algorithms.shortest_paths.unweighted">unweighted</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.all_pairs_shortest_path.html#networkx.algorithms.shortest_paths.unweighted.all_pairs_shortest_path">all_pairs_shortest_path</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.all_pairs_shortest_path_length.html#networkx.algorithms.shortest_paths.unweighted.all_pairs_shortest_path_length">all_pairs_shortest_path_length</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.bidirectional_shortest_path.html#networkx.algorithms.shortest_paths.unweighted.bidirectional_shortest_path">bidirectional_shortest_path</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.single_source_shortest_path.html#networkx.algorithms.shortest_paths.unweighted.single_source_shortest_path">single_source_shortest_path</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.single_source_shortest_path_length.html#networkx.algorithms.shortest_paths.unweighted.single_source_shortest_path_length">single_source_shortest_path_length</a>
- │   ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.single_target_shortest_path.html#networkx.algorithms.shortest_paths.unweighted.single_target_shortest_path">single_target_shortest_path</a>
- │   └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.unweighted.single_target_shortest_path_length.html#networkx.algorithms.shortest_paths.unweighted.single_target_shortest_path_length">single_target_shortest_path_length</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/shortest_paths.html#module-networkx.algorithms.shortest_paths.weighted">weighted</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.all_pairs_bellman_ford_path.html#networkx.algorithms.shortest_paths.weighted.all_pairs_bellman_ford_path">all_pairs_bellman_ford_path</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.all_pairs_bellman_ford_path_length.html#networkx.algorithms.shortest_paths.weighted.all_pairs_bellman_ford_path_length">all_pairs_bellman_ford_path_length</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.all_pairs_dijkstra.html#networkx.algorithms.shortest_paths.weighted.all_pairs_dijkstra">all_pairs_dijkstra</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.all_pairs_dijkstra_path.html#networkx.algorithms.shortest_paths.weighted.all_pairs_dijkstra_path">all_pairs_dijkstra_path</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.all_pairs_dijkstra_path_length.html#networkx.algorithms.shortest_paths.weighted.all_pairs_dijkstra_path_length">all_pairs_dijkstra_path_length</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.bellman_ford_path.html#networkx.algorithms.shortest_paths.weighted.bellman_ford_path">bellman_ford_path</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.bellman_ford_path_length.html#networkx.algorithms.shortest_paths.weighted.bellman_ford_path_length">bellman_ford_path_length</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.dijkstra_path.html#networkx.algorithms.shortest_paths.weighted.dijkstra_path">dijkstra_path</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.dijkstra_path_length.html#networkx.algorithms.shortest_paths.weighted.dijkstra_path_length">dijkstra_path_length</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.single_source_bellman_ford.html#networkx.algorithms.shortest_paths.weighted.single_source_bellman_ford">single_source_bellman_ford</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.single_source_bellman_ford_path.html#networkx.algorithms.shortest_paths.weighted.single_source_bellman_ford_path">single_source_bellman_ford_path</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.single_source_bellman_ford_path_length.html#networkx.algorithms.shortest_paths.weighted.single_source_bellman_ford_path_length">single_source_bellman_ford_path_length</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.single_source_dijkstra.html#networkx.algorithms.shortest_paths.weighted.single_source_dijkstra">single_source_dijkstra</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.single_source_dijkstra_path.html#networkx.algorithms.shortest_paths.weighted.single_source_dijkstra_path">single_source_dijkstra_path</a>
-     └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.weighted.single_source_dijkstra_path_length.html#networkx.algorithms.shortest_paths.weighted.single_source_dijkstra_path_length">single_source_dijkstra_path_length</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/traversal.html">traversal</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/traversal.html#module-networkx.algorithms.traversal.breadth_first_search">breadth_first_search</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.traversal.breadth_first_search.bfs_edges.html#networkx.algorithms.traversal.breadth_first_search.bfs_edges">bfs_edges</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.traversal.breadth_first_search.bfs_layers.html#networkx.algorithms.traversal.breadth_first_search.bfs_layers">bfs_layers</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.traversal.breadth_first_search.bfs_predecessors.html#networkx.algorithms.traversal.breadth_first_search.bfs_predecessors">bfs_predecessors</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.traversal.breadth_first_search.bfs_successors.html#networkx.algorithms.traversal.breadth_first_search.bfs_successors">bfs_successors</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.traversal.breadth_first_search.bfs_tree.html#networkx.algorithms.traversal.breadth_first_search.bfs_tree">bfs_tree</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.traversal.breadth_first_search.descendants_at_distance.html#networkx.algorithms.traversal.breadth_first_search.descendants_at_distance">descendants_at_distance</a>
-     └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.traversal.breadth_first_search.generic_bfs_edges.html#networkx.algorithms.traversal.breadth_first_search.generic_bfs_edges">generic_bfs_edges</a>
-<a href="https://networkx.org/documentation/stable/reference/algorithms/tree.html">tree</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/tree.html#module-networkx.algorithms.tree.recognition">recognition</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.tree.recognition.is_arborescence.html#networkx.algorithms.tree.recognition.is_arborescence">is_arborescence</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.tree.recognition.is_branching.html#networkx.algorithms.tree.recognition.is_branching">is_branching</a>
-     ├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.tree.recognition.is_forest.html#networkx.algorithms.tree.recognition.is_forest">is_forest</a>
-     └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.tree.recognition.is_tree.html#networkx.algorithms.tree.recognition.is_tree">is_tree</a>
-</pre>
-
-### [Generators](https://networkx.org/documentation/latest/reference/generators.html)
-
-<pre>
-<a href="https://networkx.org/documentation/stable/reference/generators.html#module-networkx.generators.classic">classic</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.barbell_graph.html#networkx.generators.classic.barbell_graph">barbell_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.circular_ladder_graph.html#networkx.generators.classic.circular_ladder_graph">circular_ladder_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.complete_graph.html#networkx.generators.classic.complete_graph">complete_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.complete_multipartite_graph.html#networkx.generators.classic.complete_multipartite_graph">complete_multipartite_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.cycle_graph.html#networkx.generators.classic.cycle_graph">cycle_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.empty_graph.html#networkx.generators.classic.empty_graph">empty_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.ladder_graph.html#networkx.generators.classic.ladder_graph">ladder_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.lollipop_graph.html#networkx.generators.classic.lollipop_graph">lollipop_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.null_graph.html#networkx.generators.classic.null_graph">null_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.path_graph.html#networkx.generators.classic.path_graph">path_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.star_graph.html#networkx.generators.classic.star_graph">star_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.tadpole_graph.html#networkx.generators.classic.tadpole_graph">tadpole_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.trivial_graph.html#networkx.generators.classic.trivial_graph">trivial_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.turan_graph.html#networkx.generators.classic.turan_graph">turan_graph</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.classic.wheel_graph.html#networkx.generators.classic.wheel_graph">wheel_graph</a>
-<a href="https://networkx.org/documentation/stable/reference/generators.html#module-networkx.generators.community">community</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.community.caveman_graph.html#networkx.generators.community.caveman_graph">caveman_graph</a>
-<a href="https://networkx.org/documentation/stable/reference/generators.html#module-networkx.generators.ego">ego</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.ego.ego_graph.html#networkx.generators.ego.ego_graph">ego_graph</a>
-<a href="https://networkx.org/documentation/stable/reference/generators.html#module-networkx.generators.small">small</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.bull_graph.html#networkx.generators.small.bull_graph">bull_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.chvatal_graph.html#networkx.generators.small.chvatal_graph">chvatal_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.cubical_graph.html#networkx.generators.small.cubical_graph">cubical_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.desargues_graph.html#networkx.generators.small.desargues_graph">desargues_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.diamond_graph.html#networkx.generators.small.diamond_graph">diamond_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.dodecahedral_graph.html#networkx.generators.small.dodecahedral_graph">dodecahedral_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.frucht_graph.html#networkx.generators.small.frucht_graph">frucht_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.heawood_graph.html#networkx.generators.small.heawood_graph">heawood_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.house_graph.html#networkx.generators.small.house_graph">house_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.house_x_graph.html#networkx.generators.small.house_x_graph">house_x_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.icosahedral_graph.html#networkx.generators.small.icosahedral_graph">icosahedral_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.krackhardt_kite_graph.html#networkx.generators.small.krackhardt_kite_graph">krackhardt_kite_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.moebius_kantor_graph.html#networkx.generators.small.moebius_kantor_graph">moebius_kantor_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.octahedral_graph.html#networkx.generators.small.octahedral_graph">octahedral_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.pappus_graph.html#networkx.generators.small.pappus_graph">pappus_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.petersen_graph.html#networkx.generators.small.petersen_graph">petersen_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.sedgewick_maze_graph.html#networkx.generators.small.sedgewick_maze_graph">sedgewick_maze_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.tetrahedral_graph.html#networkx.generators.small.tetrahedral_graph">tetrahedral_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.truncated_cube_graph.html#networkx.generators.small.truncated_cube_graph">truncated_cube_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.truncated_tetrahedron_graph.html#networkx.generators.small.truncated_tetrahedron_graph">truncated_tetrahedron_graph</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.small.tutte_graph.html#networkx.generators.small.tutte_graph">tutte_graph</a>
-<a href="https://networkx.org/documentation/stable/reference/generators.html#module-networkx.generators.social">social</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.social.davis_southern_women_graph.html#networkx.generators.social.davis_southern_women_graph">davis_southern_women_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.social.florentine_families_graph.html#networkx.generators.social.florentine_families_graph">florentine_families_graph</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.social.karate_club_graph.html#networkx.generators.social.karate_club_graph">karate_club_graph</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.generators.social.les_miserables_graph.html#networkx.generators.social.les_miserables_graph">les_miserables_graph</a>
-</pre>
-
-### Other
-
-<pre>
-<a href="https://networkx.org/documentation/stable/reference/classes/index.html">classes</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/functions.html#module-networkx.classes.function">function</a>
-     └─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.classes.function.is_negatively_weighted.html#networkx.classes.function.is_negatively_weighted">is_negatively_weighted</a>
-<a href="https://networkx.org/documentation/stable/reference/convert.html#module-networkx.convert">convert</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.convert.from_dict_of_lists.html#networkx.convert.from_dict_of_lists">from_dict_of_lists</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.convert.to_dict_of_lists.html#networkx.convert.to_dict_of_lists">to_dict_of_lists</a>
-<a href="https://networkx.org/documentation/stable/reference/convert.html#module-networkx.convert_matrix">convert_matrix</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.convert_matrix.from_pandas_edgelist.html#networkx.convert_matrix.from_pandas_edgelist">from_pandas_edgelist</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.convert_matrix.from_scipy_sparse_array.html#networkx.convert_matrix.from_scipy_sparse_array">from_scipy_sparse_array</a>
-<a href="https://networkx.org/documentation/stable/reference/relabel.html#module-networkx.relabel">relabel</a>
- ├─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.relabel.convert_node_labels_to_integers.html#networkx.relabel.convert_node_labels_to_integers">convert_node_labels_to_integers</a>
- └─ <a href="https://networkx.org/documentation/stable/reference/generated/networkx.relabel.relabel_nodes.html#networkx.relabel.relabel_nodes">relabel_nodes</a>
-</pre>
-
-To request nx-cugraph backend support for a NetworkX API that is not listed
-above, visit the [cuGraph GitHub repo](https://github.com/rapidsai/cugraph).
diff --git a/python/nx-cugraph/_nx_cugraph/VERSION b/python/nx-cugraph/_nx_cugraph/VERSION
deleted file mode 120000
index d62dc733efd..00000000000
--- a/python/nx-cugraph/_nx_cugraph/VERSION
+++ /dev/null
@@ -1 +0,0 @@
-../../../VERSION
\ No newline at end of file
diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py
deleted file mode 100644
index 9feeda568a6..00000000000
--- a/python/nx-cugraph/_nx_cugraph/__init__.py
+++ /dev/null
@@ -1,401 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tell NetworkX about the cugraph backend. This file can update itself:
-
-$ make plugin-info
-
-or
-
-$ make all  # Recommended - runs 'plugin-info' followed by 'lint'
-
-or
-
-$ python _nx_cugraph/__init__.py
-"""
-import os
-
-from _nx_cugraph._version import __version__
-
-# This is normally handled by packaging.version.Version, but instead of adding
-# an additional runtime dependency on "packaging", assume __version__ will
-# always be in <major>.<minor>.<build> format.
-(_version_major, _version_minor) = __version__.split(".")[:2]
-
-# Entries between BEGIN and END are automatically generated
-_info = {
-    "backend_name": "cugraph",
-    "project": "nx-cugraph",
-    "package": "nx_cugraph",
-    "url": "https://rapids.ai/nx-cugraph",
-    "short_summary": "GPU-accelerated backend.",
-    # "description": "TODO",
-    "functions": {
-        # BEGIN: functions
-        "all_pairs_bellman_ford_path",
-        "all_pairs_bellman_ford_path_length",
-        "all_pairs_dijkstra",
-        "all_pairs_dijkstra_path",
-        "all_pairs_dijkstra_path_length",
-        "all_pairs_shortest_path",
-        "all_pairs_shortest_path_length",
-        "ancestors",
-        "average_clustering",
-        "barbell_graph",
-        "bellman_ford_path",
-        "bellman_ford_path_length",
-        "betweenness_centrality",
-        "bfs_edges",
-        "bfs_layers",
-        "bfs_predecessors",
-        "bfs_successors",
-        "bfs_tree",
-        "bidirectional_shortest_path",
-        "bull_graph",
-        "caveman_graph",
-        "chvatal_graph",
-        "circular_ladder_graph",
-        "clustering",
-        "complement",
-        "complete_bipartite_graph",
-        "complete_graph",
-        "complete_multipartite_graph",
-        "connected_components",
-        "convert_node_labels_to_integers",
-        "core_number",
-        "cubical_graph",
-        "cycle_graph",
-        "davis_southern_women_graph",
-        "degree_centrality",
-        "desargues_graph",
-        "descendants",
-        "descendants_at_distance",
-        "diamond_graph",
-        "dijkstra_path",
-        "dijkstra_path_length",
-        "dodecahedral_graph",
-        "edge_betweenness_centrality",
-        "ego_graph",
-        "eigenvector_centrality",
-        "empty_graph",
-        "florentine_families_graph",
-        "from_dict_of_lists",
-        "from_pandas_edgelist",
-        "from_scipy_sparse_array",
-        "frucht_graph",
-        "generic_bfs_edges",
-        "has_path",
-        "heawood_graph",
-        "hits",
-        "house_graph",
-        "house_x_graph",
-        "icosahedral_graph",
-        "in_degree_centrality",
-        "is_arborescence",
-        "is_branching",
-        "is_connected",
-        "is_forest",
-        "is_isolate",
-        "is_negatively_weighted",
-        "is_tree",
-        "is_weakly_connected",
-        "isolates",
-        "k_truss",
-        "karate_club_graph",
-        "katz_centrality",
-        "krackhardt_kite_graph",
-        "ladder_graph",
-        "les_miserables_graph",
-        "lollipop_graph",
-        "louvain_communities",
-        "moebius_kantor_graph",
-        "node_connected_component",
-        "null_graph",
-        "number_connected_components",
-        "number_of_isolates",
-        "number_of_selfloops",
-        "number_weakly_connected_components",
-        "octahedral_graph",
-        "out_degree_centrality",
-        "overall_reciprocity",
-        "pagerank",
-        "pappus_graph",
-        "path_graph",
-        "petersen_graph",
-        "reciprocity",
-        "relabel_nodes",
-        "reverse",
-        "sedgewick_maze_graph",
-        "shortest_path",
-        "shortest_path_length",
-        "single_source_bellman_ford",
-        "single_source_bellman_ford_path",
-        "single_source_bellman_ford_path_length",
-        "single_source_dijkstra",
-        "single_source_dijkstra_path",
-        "single_source_dijkstra_path_length",
-        "single_source_shortest_path",
-        "single_source_shortest_path_length",
-        "single_target_shortest_path",
-        "single_target_shortest_path_length",
-        "star_graph",
-        "tadpole_graph",
-        "tetrahedral_graph",
-        "to_dict_of_lists",
-        "transitivity",
-        "triangles",
-        "trivial_graph",
-        "truncated_cube_graph",
-        "truncated_tetrahedron_graph",
-        "turan_graph",
-        "tutte_graph",
-        "weakly_connected_components",
-        "wheel_graph",
-        # END: functions
-    },
-    "additional_docs": {
-        # BEGIN: additional_docs
-        "all_pairs_bellman_ford_path": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
-        "all_pairs_bellman_ford_path_length": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
-        "average_clustering": "Directed graphs and `weight` parameter are not yet supported.",
-        "bellman_ford_path": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
-        "bellman_ford_path_length": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
-        "betweenness_centrality": "`weight` parameter is not yet supported, and RNG with seed may be different.",
-        "bfs_edges": "`sort_neighbors` parameter is not yet supported.",
-        "bfs_predecessors": "`sort_neighbors` parameter is not yet supported.",
-        "bfs_successors": "`sort_neighbors` parameter is not yet supported.",
-        "bfs_tree": "`sort_neighbors` parameter is not yet supported.",
-        "clustering": "Directed graphs and `weight` parameter are not yet supported.",
-        "core_number": "Directed graphs are not yet supported.",
-        "edge_betweenness_centrality": "`weight` parameter is not yet supported, and RNG with seed may be different.",
-        "ego_graph": "Weighted ego_graph with negative cycles is not yet supported. `NotImplementedError` will be raised if there are negative `distance` edge weights.",
-        "eigenvector_centrality": "`nstart` parameter is not used, but it is checked for validity.",
-        "from_pandas_edgelist": "cudf.DataFrame inputs also supported; value columns with str is unsuppported.",
-        "generic_bfs_edges": "`neighbors` parameter is not yet supported.",
-        "katz_centrality": "`nstart` isn't used (but is checked), and `normalized=False` is not supported.",
-        "louvain_communities": "`seed` parameter is currently ignored, and self-loops are not yet supported.",
-        "pagerank": "`dangling` parameter is not supported, but it is checked for validity.",
-        "shortest_path": "Negative weights are not yet supported.",
-        "shortest_path_length": "Negative weights are not yet supported.",
-        "single_source_bellman_ford": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
-        "single_source_bellman_ford_path": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
-        "single_source_bellman_ford_path_length": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.",
-        "transitivity": "Directed graphs are not yet supported.",
-        # END: additional_docs
-    },
-    "additional_parameters": {
-        # BEGIN: additional_parameters
-        "all_pairs_bellman_ford_path": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "all_pairs_bellman_ford_path_length": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "all_pairs_dijkstra": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "all_pairs_dijkstra_path": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "all_pairs_dijkstra_path_length": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "bellman_ford_path": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "bellman_ford_path_length": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "dijkstra_path": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "dijkstra_path_length": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "ego_graph": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "eigenvector_centrality": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "hits": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-            'weight : string or None, optional (default="weight")': "The edge attribute to use as the edge weight.",
-        },
-        "katz_centrality": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "louvain_communities": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "pagerank": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "shortest_path": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "shortest_path_length": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "single_source_bellman_ford": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "single_source_bellman_ford_path": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "single_source_bellman_ford_path_length": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "single_source_dijkstra": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "single_source_dijkstra_path": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        "single_source_dijkstra_path_length": {
-            "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.",
-        },
-        # END: additional_parameters
-    },
-}
-
-
-def get_info():
-    """Target of ``networkx.plugin_info`` entry point.
-
-    This tells NetworkX about the cugraph backend without importing nx_cugraph.
-    """
-    # Convert to e.g. `{"functions": {"myfunc": {"additional_docs": ...}}}`
-    d = _info.copy()
-    info_keys = {"additional_docs", "additional_parameters"}
-    d["functions"] = {
-        func: {
-            info_key: vals[func]
-            for info_key in info_keys
-            if func in (vals := d[info_key])
-        }
-        for func in d["functions"]
-    }
-    # Add keys for Networkx <3.3
-    for func_info in d["functions"].values():
-        if "additional_docs" in func_info:
-            func_info["extra_docstring"] = func_info["additional_docs"]
-        if "additional_parameters" in func_info:
-            func_info["extra_parameters"] = func_info["additional_parameters"]
-
-    for key in info_keys:
-        del d[key]
-
-    d["default_config"] = {
-        "use_compat_graphs": os.environ.get("NX_CUGRAPH_USE_COMPAT_GRAPHS", "true")
-        .strip()
-        .lower()
-        == "true",
-    }
-
-    # Enable zero-code change usage with a simple environment variable
-    # by setting or updating other NETWORKX environment variables.
-    if os.environ.get("NX_CUGRAPH_AUTOCONFIG", "").strip().lower() == "true":
-        from itertools import chain
-
-        def update_env_var(varname):
-            """Add "cugraph" to a list of backend names environment variable."""
-            if varname not in os.environ:
-                os.environ[varname] = "cugraph"
-                return
-            string = os.environ[varname]
-            vals = [
-                stripped for x in string.strip().split(",") if (stripped := x.strip())
-            ]
-            if "cugraph" not in vals:
-                # Should we append or prepend? Let's be first!
-                os.environ[varname] = ",".join(chain(["cugraph"], vals))
-
-        # Automatically convert NetworkX Graphs to nx-cugraph for algorithms
-        if (varname := "NETWORKX_BACKEND_PRIORITY_ALGOS") in os.environ:
-            # "*_ALGOS" is given priority in NetworkX >=3.4
-            update_env_var(varname)
-            # But update this too to "just work" if users mix env vars and nx versions
-            os.environ["NETWORKX_BACKEND_PRIORITY"] = os.environ[varname]
-        else:
-            update_env_var("NETWORKX_BACKEND_PRIORITY")
-        # And for older NetworkX versions
-        update_env_var("NETWORKX_AUTOMATIC_BACKENDS")  # For NetworkX 3.2
-        update_env_var("NETWORKX_GRAPH_CONVERT")  # For NetworkX 3.0 and 3.1
-        # Automatically create nx-cugraph Graph from graph generators
-        update_env_var("NETWORKX_BACKEND_PRIORITY_GENERATORS")
-        # Run default NetworkX implementation (in >=3.4) if not implemented by nx-cugraph
-        if (varname := "NETWORKX_FALLBACK_TO_NX") not in os.environ:
-            os.environ[varname] = "true"
-        # Cache graph conversions (default is False in NetworkX 3.2
-        if (varname := "NETWORKX_CACHE_CONVERTED_GRAPHS") not in os.environ:
-            os.environ[varname] = "true"
-
-    return d
-
-
-def _check_networkx_version() -> tuple[int, int]:
-    """Check the version of networkx and return ``(major, minor)`` version tuple."""
-    import re
-    import warnings
-
-    import networkx as nx
-
-    version_major, version_minor = nx.__version__.split(".")[:2]
-    if version_major != "3":
-        warnings.warn(
-            f"nx-cugraph version {__version__} is only known to work with networkx "
-            f"versions 3.x, but networkx {nx.__version__} is installed. "
-            "Perhaps try upgrading your Python environment.",
-            UserWarning,
-            stacklevel=2,
-        )
-
-    # Allow single-digit minor versions, e.g. 3.4 and release candidates, e.g. 3.4rc0
-    pattern = r"^\d(rc\d+)?$"
-
-    if not re.match(pattern, version_minor):
-        raise RuntimeWarning(
-            f"nx-cugraph version {__version__} does not work with networkx version "
-            f"{nx.__version__}. Please upgrade (or fix) your Python environment."
-        )
-
-    nxver_major = int(version_major)
-    nxver_minor = int(re.match(r"^\d+", version_minor).group())
-    return (nxver_major, nxver_minor)
-
-
-if __name__ == "__main__":
-    from pathlib import Path
-
-    # This script imports nx_cugraph modules, which imports nx_cugraph runtime
-    # dependencies. The modules do not need the runtime deps, so stub them out
-    # to avoid installing them.
-    class Stub:
-        def __getattr__(self, *args, **kwargs):
-            return Stub()
-
-        def __call__(self, *args, **kwargs):
-            return Stub()
-
-    import sys
-
-    sys.modules["cupy"] = Stub()
-    sys.modules["numpy"] = Stub()
-    sys.modules["pylibcugraph"] = Stub()
-
-    from _nx_cugraph.core import main
-
-    filepath = Path(__file__)
-    text = main(filepath)
-    with filepath.open("w") as f:
-        f.write(text)
diff --git a/python/nx-cugraph/_nx_cugraph/_version.py b/python/nx-cugraph/_nx_cugraph/_version.py
deleted file mode 100644
index 3cf8b23da18..00000000000
--- a/python/nx-cugraph/_nx_cugraph/_version.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import importlib.resources
-
-# Read VERSION file from the module that is symlinked to VERSION file
-# in the root of the repo at build time or copied to the module at
-# installation. VERSION is a separate file that allows CI build-time scripts
-# to update version info (including commit hashes) without modifying
-# source files.
-__version__ = (
-    importlib.resources.files(__package__).joinpath("VERSION").read_text().strip()
-)
-try:
-    __git_commit__ = (
-        importlib.resources.files(__package__)
-        .joinpath("GIT_COMMIT")
-        .read_text()
-        .strip()
-    )
-except FileNotFoundError:
-    __git_commit__ = ""
-
-__all__ = ["__git_commit__", "__version__"]
diff --git a/python/nx-cugraph/_nx_cugraph/core.py b/python/nx-cugraph/_nx_cugraph/core.py
deleted file mode 100644
index 82ce7bc438a..00000000000
--- a/python/nx-cugraph/_nx_cugraph/core.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Utilities to help keep _nx_cugraph up to date."""
-
-
-def get_functions():
-    from nx_cugraph.interface import BackendInterface
-    from nx_cugraph.utils import networkx_algorithm
-
-    return {
-        key: val
-        for key, val in vars(BackendInterface).items()
-        if isinstance(val, networkx_algorithm)
-    }
-
-
-def get_additional_docs(functions=None):
-    if functions is None:
-        functions = get_functions()
-    return {key: val.extra_doc for key, val in functions.items() if val.extra_doc}
-
-
-def get_additional_parameters(functions=None):
-    if functions is None:
-        functions = get_functions()
-    return {key: val.extra_params for key, val in functions.items() if val.extra_params}
-
-
-def update_text(text, lines_to_add, target, indent=" " * 8):
-    begin = f"# BEGIN: {target}\n"
-    end = f"# END: {target}\n"
-    start = text.index(begin)
-    stop = text.index(end)
-    to_add = "\n".join([f"{indent}{line}" for line in lines_to_add])
-    return f"{text[:start]}{begin}{to_add}\n{indent}{text[stop:]}"
-
-
-def dq_repr(s):
-    """Return repr(s) quoted with the double quote preference used by black."""
-    rs = repr(s)
-    if rs.startswith("'") and '"' not in rs:
-        rs = rs.strip("'")
-        return f'"{rs}"'
-    return rs
-
-
-def dict_to_lines(d, *, indent=""):
-    for key in sorted(d):
-        val = d[key]
-        if "\n" not in val:
-            yield f"{indent}{dq_repr(key)}: {dq_repr(val)},"
-        else:
-            yield f"{indent}{dq_repr(key)}: ("
-            *lines, last_line = val.split("\n")
-            for line in lines:
-                line += "\n"
-                yield f"    {indent}{dq_repr(line)}"
-            yield f"    {indent}{dq_repr(last_line)}"
-            yield f"{indent}),"
-
-
-def main(filepath):
-    from pathlib import Path
-
-    filepath = Path(filepath)
-    with filepath.open() as f:
-        orig_text = f.read()
-    text = orig_text
-
-    # Update functions
-    functions = get_functions()
-    to_add = [f'"{name}",' for name in sorted(functions)]
-    text = update_text(text, to_add, "functions")
-
-    # Update additional_docs
-    additional_docs = get_additional_docs(functions)
-    to_add = list(dict_to_lines(additional_docs))
-    text = update_text(text, to_add, "additional_docs")
-
-    # Update additional_parameters
-    additional_parameters = get_additional_parameters(functions)
-    to_add = []
-    for name in sorted(additional_parameters):
-        params = additional_parameters[name]
-        to_add.append(f"{dq_repr(name)}: {{")
-        to_add.extend(dict_to_lines(params, indent=" " * 4))
-        to_add.append("},")
-    text = update_text(text, to_add, "additional_parameters")
-    return text
diff --git a/python/nx-cugraph/conftest.py b/python/nx-cugraph/conftest.py
deleted file mode 100644
index e329b28d81c..00000000000
--- a/python/nx-cugraph/conftest.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--bench",
-        action="store_true",
-        default=False,
-        help="Run benchmarks (sugar for --benchmark-enable) and skip other tests"
-        " (to run both benchmarks AND tests, use --all)",
-    )
-    parser.addoption(
-        "--all",
-        action="store_true",
-        default=False,
-        help="Run benchmarks AND tests (unlike --bench, which only runs benchmarks)",
-    )
diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml
deleted file mode 100644
index dab2ea70ef1..00000000000
--- a/python/nx-cugraph/lint.yaml
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-#
-# https://pre-commit.com/
-#
-# Before first use: `pre-commit install`
-# To run: `make lint`
-# To update: `make lint-update`
-#  - &flake8_dependencies below needs updated manually
-fail_fast: false
-default_language_version:
-    python: python3
-repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
-    hooks:
-      - id: check-added-large-files
-      - id: check-case-conflict
-      - id: check-merge-conflict
-      - id: check-symlinks
-      - id: check-ast
-      - id: check-toml
-      - id: check-yaml
-      - id: debug-statements
-      - id: end-of-file-fixer
-        exclude_types: [svg]
-      - id: mixed-line-ending
-      - id: trailing-whitespace
-  - repo: https://github.com/abravalheri/validate-pyproject
-    rev: v0.19
-    hooks:
-      - id: validate-pyproject
-        name: Validate pyproject.toml
-  - repo: https://github.com/PyCQA/autoflake
-    rev: v2.3.1
-    hooks:
-      - id: autoflake
-        args: [--in-place]
-  - repo: https://github.com/pycqa/isort
-    rev: 5.13.2
-    hooks:
-      - id: isort
-  - repo: https://github.com/asottile/pyupgrade
-    rev: v3.17.0
-    hooks:
-      - id: pyupgrade
-        args: [--py310-plus]
-  - repo: https://github.com/psf/black
-    rev: 24.8.0
-    hooks:
-      - id: black
-      # - id: black-jupyter
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.7
-    hooks:
-      - id: ruff
-        args: [--fix-only, --show-fixes]  # --unsafe-fixes]
-  - repo: https://github.com/PyCQA/flake8
-    rev: 7.1.1
-    hooks:
-      - id: flake8
-        args: ['--per-file-ignores=_nx_cugraph/__init__.py:E501', '--extend-ignore=B020,SIM105']  # Why is this necessary?
-        additional_dependencies: &flake8_dependencies
-          # These versions need updated manually
-          - flake8==7.1.1
-          - flake8-bugbear==24.8.19
-          - flake8-simplify==0.21.0
-  - repo: https://github.com/asottile/yesqa
-    rev: v1.5.0
-    hooks:
-      - id: yesqa
-        additional_dependencies: *flake8_dependencies
-  - repo: https://github.com/codespell-project/codespell
-    rev: v2.3.0
-    hooks:
-      - id: codespell
-        types_or: [python, rst, markdown]
-        additional_dependencies: [tomli]
-        files: ^(nx_cugraph|docs)/
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.7
-    hooks:
-      - id: ruff
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
-    hooks:
-      - id: no-commit-to-branch
-        args: [-p, "^branch-2....$"]
diff --git a/python/nx-cugraph/nx_cugraph/__init__.py b/python/nx-cugraph/nx_cugraph/__init__.py
deleted file mode 100644
index 4404e57f645..00000000000
--- a/python/nx-cugraph/nx_cugraph/__init__.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from networkx.exception import *
-
-from _nx_cugraph._version import __git_commit__, __version__
-from _nx_cugraph import _check_networkx_version
-
-_nxver: tuple[int, int] = _check_networkx_version()
-
-from . import utils
-
-from . import classes
-from .classes import *
-
-from . import convert
-from .convert import *
-
-from . import convert_matrix
-from .convert_matrix import *
-
-from . import relabel
-from .relabel import *
-
-from . import generators
-from .generators import *
-
-from . import algorithms
-from .algorithms import *
-
-from .interface import BackendInterface
-
-BackendInterface.Graph = classes.Graph
-BackendInterface.DiGraph = classes.DiGraph
-BackendInterface.MultiGraph = classes.MultiGraph
-BackendInterface.MultiDiGraph = classes.MultiDiGraph
-del BackendInterface
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py
deleted file mode 100644
index b4a10bcf0a1..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from . import (
-    bipartite,
-    centrality,
-    cluster,
-    community,
-    components,
-    link_analysis,
-    operators,
-    shortest_paths,
-    traversal,
-    tree,
-)
-from .bipartite import complete_bipartite_graph
-from .centrality import *
-from .cluster import *
-from .components import *
-from .core import *
-from .dag import *
-from .isolate import *
-from .link_analysis import *
-from .operators import *
-from .reciprocity import *
-from .shortest_paths import *
-from .traversal import *
-from .tree.recognition import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/__init__.py
deleted file mode 100644
index bfc7f1d4d42..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .generators import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py
deleted file mode 100644
index 214970235c6..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from numbers import Integral
-
-import cupy as cp
-import networkx as nx
-import numpy as np
-
-from nx_cugraph import _nxver
-from nx_cugraph.generators._utils import _create_using_class, _number_and_nodes
-from nx_cugraph.utils import index_dtype, networkx_algorithm
-
-__all__ = [
-    "complete_bipartite_graph",
-]
-
-
-@networkx_algorithm(nodes_or_number=[0, 1], version_added="23.12")
-def complete_bipartite_graph(n1, n2, create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    orig_n1, unused_nodes1 = n1
-    orig_n2, unused_nodes2 = n2
-    n1, nodes1 = _number_and_nodes(n1)
-    n2, nodes2 = _number_and_nodes(n2)
-    all_indices = cp.indices((n1, n2), dtype=index_dtype)
-    indices0 = all_indices[0].ravel()
-    indices1 = all_indices[1].ravel() + n1
-    del all_indices
-    src_indices = cp.hstack((indices0, indices1))
-    dst_indices = cp.hstack((indices1, indices0))
-    bipartite = cp.zeros(n1 + n2, np.int8)
-    bipartite[n1:] = 1
-    if isinstance(orig_n1, Integral) and isinstance(orig_n2, Integral):
-        nodes = None
-    else:
-        nodes = list(range(n1)) if nodes1 is None else nodes1
-        nodes.extend(range(n2) if nodes2 is None else nodes2)
-        if len(set(nodes)) != len(nodes):
-            raise nx.NetworkXError("Inputs n1 and n2 must contain distinct nodes")
-    if _nxver <= (3, 3):
-        name = f"complete_bipartite_graph({orig_n1}, {orig_n2})"
-    else:
-        name = f"complete_bipartite_graph({n1}, {n2})"
-    G = graph_class.from_coo(
-        n1 + n2,
-        src_indices,
-        dst_indices,
-        node_values={"bipartite": bipartite},
-        id_to_key=nodes,
-        name=name,
-    )
-    if inplace:
-        return create_using._become(G)
-    return G
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/__init__.py
deleted file mode 100644
index 496dc6aff81..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .betweenness import *
-from .degree_alg import *
-from .eigenvector import *
-from .katz import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py
deleted file mode 100644
index f6bb142cded..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import pylibcugraph as plc
-
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import _seed_to_int, networkx_algorithm
-
-__all__ = ["betweenness_centrality", "edge_betweenness_centrality"]
-
-
-@networkx_algorithm(
-    is_incomplete=True,  # weight not supported
-    is_different=True,  # RNG with seed is different
-    version_added="23.10",
-    _plc="betweenness_centrality",
-)
-def betweenness_centrality(
-    G, k=None, normalized=True, weight=None, endpoints=False, seed=None
-):
-    """`weight` parameter is not yet supported, and RNG with seed may be different."""
-    if weight is not None:
-        raise NotImplementedError(
-            "Weighted implementation of betweenness centrality not currently supported"
-        )
-    seed = _seed_to_int(seed)
-    G = _to_graph(G, weight)
-    node_ids, values = plc.betweenness_centrality(
-        resource_handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(),
-        k=k,
-        random_state=seed,
-        normalized=normalized,
-        include_endpoints=endpoints,
-        do_expensive_check=False,
-    )
-    return G._nodearrays_to_dict(node_ids, values)
-
-
-@betweenness_centrality._can_run
-def _(G, k=None, normalized=True, weight=None, endpoints=False, seed=None):
-    return weight is None
-
-
-@networkx_algorithm(
-    is_incomplete=True,  # weight not supported
-    is_different=True,  # RNG with seed is different
-    version_added="23.10",
-    _plc="edge_betweenness_centrality",
-)
-def edge_betweenness_centrality(G, k=None, normalized=True, weight=None, seed=None):
-    """`weight` parameter is not yet supported, and RNG with seed may be different."""
-    if weight is not None:
-        raise NotImplementedError(
-            "Weighted implementation of betweenness centrality not currently supported"
-        )
-    seed = _seed_to_int(seed)
-    G = _to_graph(G, weight)
-    src_ids, dst_ids, values, _edge_ids = plc.edge_betweenness_centrality(
-        resource_handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(),
-        k=k,
-        random_state=seed,
-        normalized=normalized,
-        do_expensive_check=False,
-    )
-    if not G.is_directed():
-        mask = src_ids <= dst_ids
-        src_ids = src_ids[mask]
-        dst_ids = dst_ids[mask]
-        values = 2 * values[mask]
-    return G._edgearrays_to_dict(src_ids, dst_ids, values)
-
-
-@edge_betweenness_centrality._can_run
-def _(G, k=None, normalized=True, weight=None, seed=None):
-    return weight is None
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/degree_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/degree_alg.py
deleted file mode 100644
index 1cc051c698f..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/degree_alg.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from nx_cugraph.convert import _to_directed_graph, _to_graph
-from nx_cugraph.utils import networkx_algorithm, not_implemented_for
-
-__all__ = ["degree_centrality", "in_degree_centrality", "out_degree_centrality"]
-
-
-@networkx_algorithm(version_added="23.12")
-def degree_centrality(G):
-    G = _to_graph(G)
-    if len(G) <= 1:
-        return dict.fromkeys(G, 1)
-    deg = G._degrees_array()
-    centrality = deg * (1 / (len(G) - 1))
-    return G._nodearray_to_dict(centrality)
-
-
-@degree_centrality._should_run
-def _(G):
-    return "Fast algorithm; not worth converting."
-
-
-@not_implemented_for("undirected")
-@networkx_algorithm(version_added="23.12")
-def in_degree_centrality(G):
-    G = _to_directed_graph(G)
-    if len(G) <= 1:
-        return dict.fromkeys(G, 1)
-    deg = G._in_degrees_array()
-    centrality = deg * (1 / (len(G) - 1))
-    return G._nodearray_to_dict(centrality)
-
-
-@in_degree_centrality._should_run
-def _(G):
-    return "Fast algorithm; not worth converting."
-
-
-@not_implemented_for("undirected")
-@networkx_algorithm(version_added="23.12")
-def out_degree_centrality(G):
-    G = _to_directed_graph(G)
-    if len(G) <= 1:
-        return dict.fromkeys(G, 1)
-    deg = G._out_degrees_array()
-    centrality = deg * (1 / (len(G) - 1))
-    return G._nodearray_to_dict(centrality)
-
-
-@out_degree_centrality._should_run
-def _(G):
-    return "Fast algorithm; not worth converting."
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py
deleted file mode 100644
index c32b6fbb708..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import numpy as np
-import pylibcugraph as plc
-
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import (
-    _dtype_param,
-    _get_float_dtype,
-    networkx_algorithm,
-    not_implemented_for,
-)
-
-__all__ = ["eigenvector_centrality"]
-
-
-@not_implemented_for("multigraph")
-@networkx_algorithm(
-    extra_params=_dtype_param,
-    is_incomplete=True,  # nstart not supported
-    version_added="23.12",
-    _plc="eigenvector_centrality",
-)
-def eigenvector_centrality(
-    G, max_iter=100, tol=1.0e-6, nstart=None, weight=None, *, dtype=None
-):
-    """`nstart` parameter is not used, but it is checked for validity."""
-    G = _to_graph(G, weight, 1, np.float32)
-    if len(G) == 0:
-        raise nx.NetworkXPointlessConcept(
-            "cannot compute centrality for the null graph"
-        )
-    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-    if nstart is not None:
-        # Check if given nstart is valid even though we don't use it
-        nstart = G._dict_to_nodearray(nstart, dtype=dtype)
-        if (nstart == 0).all():
-            raise nx.NetworkXError("initial vector cannot have all zero values")
-        if nstart.sum() == 0:
-            raise ZeroDivisionError
-        # nstart /= total  # Uncomment (and assign total) when nstart is used below
-    try:
-        node_ids, values = plc.eigenvector_centrality(
-            resource_handle=plc.ResourceHandle(),
-            graph=G._get_plc_graph(weight, 1, dtype, store_transposed=True),
-            epsilon=tol,
-            max_iterations=max_iter,
-            do_expensive_check=False,
-        )
-    except RuntimeError as exc:
-        # Errors from PLC are sometimes a little scary and not very helpful
-        raise nx.PowerIterationFailedConvergence(max_iter) from exc
-    return G._nodearrays_to_dict(node_ids, values)
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py
deleted file mode 100644
index 1c6ed61703d..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import numpy as np
-import pylibcugraph as plc
-
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import (
-    _dtype_param,
-    _get_float_dtype,
-    networkx_algorithm,
-    not_implemented_for,
-)
-
-__all__ = ["katz_centrality"]
-
-
-@not_implemented_for("multigraph")
-@networkx_algorithm(
-    extra_params=_dtype_param,
-    is_incomplete=True,  # nstart and normalized=False not supported
-    version_added="23.12",
-    _plc="katz_centrality",
-)
-def katz_centrality(
-    G,
-    alpha=0.1,
-    beta=1.0,
-    max_iter=1000,
-    tol=1.0e-6,
-    nstart=None,
-    normalized=True,
-    weight=None,
-    *,
-    dtype=None,
-):
-    """`nstart` isn't used (but is checked), and `normalized=False` is not supported."""
-    if not normalized:
-        # Redundant with the `_can_run` check below when being dispatched by NetworkX,
-        # but we raise here in case this funcion is called directly.
-        raise NotImplementedError("normalized=False is not supported.")
-    G = _to_graph(G, weight, 1, np.float32)
-    if (N := len(G)) == 0:
-        return {}
-    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-    if nstart is not None:
-        # Check if given nstart is valid even though we don't use it
-        nstart = G._dict_to_nodearray(nstart, 0, dtype)
-    b = bs = None
-    try:
-        b = float(beta)
-    except (TypeError, ValueError) as exc:
-        try:
-            bs = G._dict_to_nodearray(beta, dtype=dtype)
-            b = 1.0  # float value must be given to PLC (and will be ignored)
-        except (KeyError, ValueError):
-            raise nx.NetworkXError(
-                "beta dictionary must have a value for every node"
-            ) from exc
-    try:
-        node_ids, values = plc.katz_centrality(
-            resource_handle=plc.ResourceHandle(),
-            graph=G._get_plc_graph(weight, 1, dtype, store_transposed=True),
-            betas=bs,
-            alpha=alpha,
-            beta=b,
-            epsilon=N * tol,
-            max_iterations=max_iter,
-            do_expensive_check=False,
-        )
-    except RuntimeError as exc:
-        # Errors from PLC are sometimes a little scary and not very helpful
-        raise nx.PowerIterationFailedConvergence(max_iter) from exc
-    return G._nodearrays_to_dict(node_ids, values)
-
-
-@katz_centrality._can_run
-def _(
-    G,
-    alpha=0.1,
-    beta=1.0,
-    max_iter=1000,
-    tol=1.0e-6,
-    nstart=None,
-    normalized=True,
-    weight=None,
-    *,
-    dtype=None,
-):
-    return normalized
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/cluster.py b/python/nx-cugraph/nx_cugraph/algorithms/cluster.py
deleted file mode 100644
index c355a1bb7c9..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/cluster.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import pylibcugraph as plc
-
-from nx_cugraph.convert import _to_undirected_graph
-from nx_cugraph.utils import networkx_algorithm, not_implemented_for
-
-__all__ = [
-    "triangles",
-    "average_clustering",
-    "clustering",
-    "transitivity",
-]
-
-
-def _triangles(G, nodes, symmetrize=None):
-    if nodes is not None:
-        if is_single_node := (nodes in G):
-            nodes = [nodes if G.key_to_id is None else G.key_to_id[nodes]]
-        else:
-            nodes = list(nodes)
-        nodes = G._list_to_nodearray(nodes)
-    else:
-        is_single_node = False
-    if len(G) == 0:
-        return None, None, is_single_node
-    node_ids, triangles = plc.triangle_count(
-        resource_handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(symmetrize=symmetrize),
-        start_list=nodes,
-        do_expensive_check=False,
-    )
-    return node_ids, triangles, is_single_node
-
-
-@not_implemented_for("directed")
-@networkx_algorithm(version_added="24.02", _plc="triangle_count")
-def triangles(G, nodes=None):
-    G = _to_undirected_graph(G)
-    node_ids, triangles, is_single_node = _triangles(G, nodes)
-    if len(G) == 0:
-        return {}
-    if is_single_node:
-        return int(triangles[0])
-    return G._nodearrays_to_dict(node_ids, triangles)
-
-
-@triangles._should_run
-def _(G, nodes=None):
-    if nodes is None or nodes not in G:
-        return True
-    return "Fast algorithm when computing for a single node; not worth converting."
-
-
-@not_implemented_for("directed")
-@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="triangle_count")
-def clustering(G, nodes=None, weight=None):
-    """Directed graphs and `weight` parameter are not yet supported."""
-    if weight is not None:
-        raise NotImplementedError(
-            "Weighted implementation of clustering not currently supported"
-        )
-    G = _to_undirected_graph(G)
-    node_ids, triangles, is_single_node = _triangles(G, nodes)
-    if len(G) == 0:
-        return {}
-    if is_single_node:
-        numer = int(triangles[0])
-        if numer == 0:
-            return 0
-        degree = int((G.src_indices == nodes).sum())
-        return 2 * numer / (degree * (degree - 1))
-    degrees = G._degrees_array(ignore_selfloops=True)[node_ids]
-    denom = degrees * (degrees - 1)
-    results = 2 * triangles / denom
-    results = cp.where(denom, results, 0)  # 0 where we divided by 0
-    return G._nodearrays_to_dict(node_ids, results)
-
-
-@clustering._can_run
-def _(G, nodes=None, weight=None):
-    return weight is None and not G.is_directed()
-
-
-@clustering._should_run
-def _(G, nodes=None, weight=None):
-    if nodes is None or nodes not in G:
-        return True
-    return "Fast algorithm when computing for a single node; not worth converting."
-
-
-@not_implemented_for("directed")
-@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="triangle_count")
-def average_clustering(G, nodes=None, weight=None, count_zeros=True):
-    """Directed graphs and `weight` parameter are not yet supported."""
-    if weight is not None:
-        raise NotImplementedError(
-            "Weighted implementation of average_clustering not currently supported"
-        )
-    G = _to_undirected_graph(G)
-    node_ids, triangles, is_single_node = _triangles(G, nodes)
-    if len(G) == 0:
-        raise ZeroDivisionError
-    degrees = G._degrees_array(ignore_selfloops=True)[node_ids]
-    if not count_zeros:
-        mask = triangles != 0
-        triangles = triangles[mask]
-        if triangles.size == 0:
-            raise ZeroDivisionError
-        degrees = degrees[mask]
-    denom = degrees * (degrees - 1)
-    results = 2 * triangles / denom
-    if count_zeros:
-        results = cp.where(denom, results, 0)  # 0 where we divided by 0
-    return float(results.mean())
-
-
-@average_clustering._can_run
-def _(G, nodes=None, weight=None, count_zeros=True):
-    return weight is None and not G.is_directed()
-
-
-@not_implemented_for("directed")
-@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="triangle_count")
-def transitivity(G):
-    """Directed graphs are not yet supported."""
-    G = _to_undirected_graph(G)
-    if len(G) == 0:
-        return 0
-    node_ids, triangles = plc.triangle_count(
-        resource_handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(),
-        start_list=None,
-        do_expensive_check=False,
-    )
-    numer = int(triangles.sum())
-    if numer == 0:
-        return 0
-    degrees = G._degrees_array(ignore_selfloops=True)[node_ids]
-    denom = int((degrees * (degrees - 1)).sum())
-    return 2 * numer / denom
-
-
-@transitivity._can_run
-def _(G):
-    # Is transitivity supposed to work on directed graphs?
-    return not G.is_directed()
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/community/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/community/__init__.py
deleted file mode 100644
index 51a4f5c195f..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/community/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .louvain import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py
deleted file mode 100644
index 52c512c454d..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import warnings
-
-import pylibcugraph as plc
-
-from nx_cugraph import _nxver
-from nx_cugraph.convert import _to_undirected_graph
-from nx_cugraph.utils import (
-    _dtype_param,
-    _groupby,
-    _seed_to_int,
-    networkx_algorithm,
-    not_implemented_for,
-)
-
-__all__ = ["louvain_communities"]
-
-# max_level argument was added to NetworkX 3.3
-if _nxver <= (3, 2):
-    _max_level_param = {
-        "max_level : int, optional": (
-            "Upper limit of the number of macro-iterations (max: 500)."
-        )
-    }
-else:
-    _max_level_param = {}
-
-
-def _louvain_communities_nx32(
-    G,
-    weight="weight",
-    resolution=1,
-    threshold=0.0000001,
-    seed=None,
-    *,
-    max_level=None,
-    dtype=None,
-):
-    """`seed` parameter is currently ignored, and self-loops are not yet supported."""
-    return _louvain_communities(
-        G, weight, resolution, threshold, max_level, seed, dtype=dtype
-    )
-
-
-def _louvain_communities(
-    G,
-    weight="weight",
-    resolution=1,
-    threshold=0.0000001,
-    max_level=None,
-    seed=None,
-    *,
-    dtype=None,
-):
-    """`seed` parameter is currently ignored, and self-loops are not yet supported."""
-    # NetworkX allows both directed and undirected, but cugraph only allows undirected.
-    seed = _seed_to_int(seed)  # Unused, but ensure it's valid for future compatibility
-    G = _to_undirected_graph(G, weight)
-    if G.src_indices.size == 0:
-        return [{key} for key in G._nodeiter_to_iter(range(len(G)))]
-    if max_level is None:
-        max_level = 500
-    elif max_level > 500:
-        warnings.warn(
-            f"max_level is set too high (={max_level}), setting it to 500.",
-            UserWarning,
-            stacklevel=2,
-        )
-        max_level = 500
-    node_ids, clusters, modularity = plc.louvain(
-        resource_handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(weight, 1, dtype),
-        max_level=max_level,
-        threshold=threshold,
-        resolution=resolution,
-        do_expensive_check=False,
-    )
-    groups = _groupby(clusters, node_ids, groups_are_canonical=True)
-    return [set(G._nodearray_to_list(ids)) for ids in groups.values()]
-
-
-_louvain_decorator = networkx_algorithm(
-    extra_params={
-        **_max_level_param,
-        **_dtype_param,
-    },
-    is_incomplete=True,  # seed not supported; self-loops not supported
-    is_different=True,  # RNG different
-    version_added="23.10",
-    _plc="louvain",
-    name="louvain_communities",
-)
-
-if _max_level_param:  # networkx <= 3.2
-    _louvain_communities_nx32.__name__ = "louvain_communities"
-    louvain_communities = not_implemented_for("directed")(
-        _louvain_decorator(_louvain_communities_nx32)
-    )
-
-    @louvain_communities._can_run
-    def _(
-        G,
-        weight="weight",
-        resolution=1,
-        threshold=0.0000001,
-        seed=None,
-        *,
-        max_level=None,
-        dtype=None,
-    ):
-        # NetworkX allows both directed and undirected, but cugraph only undirected.
-        return not G.is_directed()
-
-else:  # networkx >= 3.3
-    _louvain_communities.__name__ = "louvain_communities"
-    louvain_communities = not_implemented_for("directed")(
-        _louvain_decorator(_louvain_communities)
-    )
-
-    @louvain_communities._can_run
-    def _(
-        G,
-        weight="weight",
-        resolution=1,
-        threshold=0.0000001,
-        max_level=None,
-        seed=None,
-        *,
-        dtype=None,
-    ):
-        # NetworkX allows both directed and undirected, but cugraph only undirected.
-        return not G.is_directed()
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py
deleted file mode 100644
index 12a09b535c0..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .connected import *
-from .strongly_connected import *
-from .weakly_connected import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py
deleted file mode 100644
index 24955e3eac8..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-import pylibcugraph as plc
-
-from nx_cugraph.convert import _to_undirected_graph
-from nx_cugraph.utils import _groupby, networkx_algorithm, not_implemented_for
-
-__all__ = [
-    "number_connected_components",
-    "connected_components",
-    "is_connected",
-    "node_connected_component",
-]
-
-
-@not_implemented_for("directed")
-@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components")
-def number_connected_components(G):
-    G = _to_undirected_graph(G)
-    return _number_connected_components(G)
-
-
-def _number_connected_components(G, symmetrize=None):
-    if G.src_indices.size == 0:
-        return len(G)
-    unused_node_ids, labels = plc.weakly_connected_components(
-        resource_handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(symmetrize=symmetrize),
-        offsets=None,
-        indices=None,
-        weights=None,
-        labels=None,
-        do_expensive_check=False,
-    )
-    return cp.unique(labels).size
-
-
-@number_connected_components._can_run
-def _(G):
-    # NetworkX <= 3.2.1 does not check directedness for us
-    return not G.is_directed()
-
-
-@not_implemented_for("directed")
-@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components")
-def connected_components(G):
-    G = _to_undirected_graph(G)
-    return _connected_components(G)
-
-
-def _connected_components(G, symmetrize=None):
-    if G.src_indices.size == 0:
-        return [{key} for key in G._nodeiter_to_iter(range(len(G)))]
-    node_ids, labels = plc.weakly_connected_components(
-        resource_handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(symmetrize=symmetrize),
-        offsets=None,
-        indices=None,
-        weights=None,
-        labels=None,
-        do_expensive_check=False,
-    )
-    groups = _groupby(labels, node_ids)
-    return (G._nodearray_to_set(connected_ids) for connected_ids in groups.values())
-
-
-@not_implemented_for("directed")
-@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components")
-def is_connected(G):
-    G = _to_undirected_graph(G)
-    return _is_connected(G)
-
-
-def _is_connected(G, symmetrize=None):
-    if len(G) == 0:
-        raise nx.NetworkXPointlessConcept(
-            "Connectivity is undefined for the null graph."
-        )
-    if G.src_indices.size == 0:
-        return len(G) == 1
-    unused_node_ids, labels = plc.weakly_connected_components(
-        resource_handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(symmetrize=symmetrize),
-        offsets=None,
-        indices=None,
-        weights=None,
-        labels=None,
-        do_expensive_check=False,
-    )
-    return bool((labels == labels[0]).all())
-
-
-@not_implemented_for("directed")
-@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components")
-def node_connected_component(G, n):
-    # We could also do plain BFS from n
-    G = _to_undirected_graph(G)
-    node_id = n if G.key_to_id is None else G.key_to_id[n]
-    node_ids, labels = plc.weakly_connected_components(
-        resource_handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(),
-        offsets=None,
-        indices=None,
-        weights=None,
-        labels=None,
-        do_expensive_check=False,
-    )
-    indices = cp.nonzero(node_ids == node_id)[0]
-    if indices.size == 0:
-        return {n}
-    return G._nodearray_to_set(node_ids[labels == labels[indices[0]]])
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py
deleted file mode 100644
index a63b3237dfc..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-import pylibcugraph as plc
-
-from nx_cugraph.convert import _to_directed_graph
-from nx_cugraph.utils import _groupby, index_dtype, not_implemented_for
-
-__all__ = [
-    "number_strongly_connected_components",
-    "strongly_connected_components",
-    "is_strongly_connected",
-]
-
-
-def _strongly_connected_components(G):
-    # TODO: create utility function to convert just the indices to CSR
-    # TODO: this uses a legacy PLC function (strongly_connected_components)
-    N = len(G)
-    indices = cp.lexsort(cp.vstack((G.dst_indices, G.src_indices)))
-    dst_indices = G.dst_indices[indices]
-    offsets = cp.searchsorted(
-        G.src_indices, cp.arange(N + 1, dtype=index_dtype), sorter=indices
-    ).astype(index_dtype)
-    labels = cp.zeros(N, dtype=index_dtype)
-    plc.strongly_connected_components(
-        offsets=offsets,
-        indices=dst_indices,
-        weights=None,
-        num_verts=N,
-        num_edges=dst_indices.size,
-        labels=labels,
-    )
-    return labels
-
-
-# The networkx_algorithm decorator is (temporarily) removed to disable
-# dispatching for this function. The current cugraph
-# strongly_connected_components is a legacy implementation with known issues,
-# and in most cases should not be used until the cugraph team can provide an
-# update.
-#
-# Users can still call this via the nx_cugraph module directly:
-# >>> import nx_cugraph as nxcg
-# >>> nxcg.strongly_connected_components(...)
-
-
-@not_implemented_for("undirected")
-# @networkx_algorithm(version_added="24.02", _plc="strongly_connected_components")
-def strongly_connected_components(G):
-    G = _to_directed_graph(G)
-    if G.src_indices.size == 0:
-        return [{key} for key in G._nodeiter_to_iter(range(len(G)))]
-    labels = _strongly_connected_components(G)
-    groups = _groupby(labels, cp.arange(len(G), dtype=index_dtype))
-    return (G._nodearray_to_set(connected_ids) for connected_ids in groups.values())
-
-
-@not_implemented_for("undirected")
-# @networkx_algorithm(version_added="24.02", _plc="strongly_connected_components")
-def number_strongly_connected_components(G):
-    G = _to_directed_graph(G)
-    if G.src_indices.size == 0:
-        return len(G)
-    labels = _strongly_connected_components(G)
-    return cp.unique(labels).size
-
-
-@not_implemented_for("undirected")
-# @networkx_algorithm(version_added="24.02", _plc="strongly_connected_components")
-def is_strongly_connected(G):
-    G = _to_directed_graph(G)
-    if len(G) == 0:
-        raise nx.NetworkXPointlessConcept(
-            "Connectivity is undefined for the null graph."
-        )
-    if G.src_indices.size == 0:
-        return len(G) == 1
-    labels = _strongly_connected_components(G)
-    return bool((labels == labels[0]).all())
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py
deleted file mode 100644
index e42acdd3d84..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from nx_cugraph.convert import _to_directed_graph
-from nx_cugraph.utils import networkx_algorithm, not_implemented_for
-
-from .connected import (
-    _connected_components,
-    _is_connected,
-    _number_connected_components,
-)
-
-__all__ = [
-    "number_weakly_connected_components",
-    "weakly_connected_components",
-    "is_weakly_connected",
-]
-
-
-@not_implemented_for("undirected")
-@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components")
-def weakly_connected_components(G):
-    G = _to_directed_graph(G)
-    return _connected_components(G, symmetrize="union")
-
-
-@not_implemented_for("undirected")
-@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components")
-def number_weakly_connected_components(G):
-    G = _to_directed_graph(G)
-    return _number_connected_components(G, symmetrize="union")
-
-
-@not_implemented_for("undirected")
-@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components")
-def is_weakly_connected(G):
-    G = _to_directed_graph(G)
-    return _is_connected(G, symmetrize="union")
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/core.py b/python/nx-cugraph/nx_cugraph/algorithms/core.py
deleted file mode 100644
index e69ee88a17c..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/core.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-import pylibcugraph as plc
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-from nx_cugraph.convert import _to_undirected_graph
-from nx_cugraph.utils import (
-    _get_int_dtype,
-    index_dtype,
-    networkx_algorithm,
-    not_implemented_for,
-)
-
-__all__ = ["core_number", "k_truss"]
-
-
-@not_implemented_for("directed")
-@not_implemented_for("multigraph")
-@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="core_number")
-def core_number(G):
-    """Directed graphs are not yet supported."""
-    G = _to_undirected_graph(G)
-    if len(G) == 0:
-        return {}
-    if nxcg.number_of_selfloops(G) > 0:
-        raise nx.NetworkXNotImplemented(
-            "Input graph has self loops which is not permitted; "
-            "Consider using G.remove_edges_from(nx.selfloop_edges(G))."
-        )
-    node_ids, core_numbers = plc.core_number(
-        resource_handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(),
-        degree_type="bidirectional",
-        do_expensive_check=False,
-    )
-    return G._nodearrays_to_dict(node_ids, core_numbers)
-
-
-@core_number._can_run
-def _(G):
-    return not G.is_directed()
-
-
-@not_implemented_for("directed")
-@not_implemented_for("multigraph")
-@networkx_algorithm(is_incomplete=True, version_added="23.12", _plc="k_truss_subgraph")
-def k_truss(G, k):
-    if is_nx := isinstance(G, nx.Graph):
-        is_compat_graph = isinstance(G, nxcg.Graph)
-        G = nxcg.from_networkx(G, preserve_all_attrs=True)
-    else:
-        is_compat_graph = False
-    if nxcg.number_of_selfloops(G) > 0:
-        if _nxver <= (3, 2):
-            exc_class = nx.NetworkXError
-        else:
-            exc_class = nx.NetworkXNotImplemented
-        raise exc_class(
-            "Input graph has self loops which is not permitted; "
-            "Consider using G.remove_edges_from(nx.selfloop_edges(G))."
-        )
-
-    # TODO: create renumbering helper function(s)
-    if k < 3:
-        # k-truss graph is comprised of nodes incident on k-2 triangles, so k<3 is a
-        # boundary condition. Here, all we need to do is drop nodes with zero degree.
-        # Technically, it would be okay to delete this branch of code, because
-        # plc.k_truss_subgraph behaves the same for 0 <= k < 3. We keep this branch b/c
-        # it's faster and has an "early return" if there are no nodes with zero degree.
-        degrees = G._degrees_array()
-        # Renumber step 0: node indices
-        node_indices = degrees.nonzero()[0]
-        if degrees.size == node_indices.size:
-            # No change
-            return G if is_nx else G.copy()
-        src_indices = G.src_indices
-        dst_indices = G.dst_indices
-        # Renumber step 1: edge values (no changes needed)
-        edge_values = {key: val.copy() for key, val in G.edge_values.items()}
-        edge_masks = {key: val.copy() for key, val in G.edge_masks.items()}
-    else:
-        edge_dtype = _get_int_dtype(G.src_indices.size - 1)
-        edge_indices = cp.arange(G.src_indices.size, dtype=edge_dtype)
-        src_indices, dst_indices, edge_indices, _ = plc.k_truss_subgraph(
-            resource_handle=plc.ResourceHandle(),
-            graph=G._get_plc_graph(edge_array=edge_indices),
-            k=k,
-            do_expensive_check=False,
-        )
-        # Renumber step 0: node indices
-        node_indices = cp.unique(cp.concatenate([src_indices, dst_indices]))
-        # Renumber step 1: edge values
-        if edge_indices.dtype != edge_dtype:
-            # The returned edge_indices may have different dtype (and float)
-            edge_indices = edge_indices.astype(edge_dtype)
-        edge_values = {key: val[edge_indices] for key, val in G.edge_values.items()}
-        edge_masks = {key: val[edge_indices] for key, val in G.edge_masks.items()}
-    # Renumber step 2: edge indices
-    src_indices = cp.searchsorted(node_indices, src_indices).astype(index_dtype)
-    dst_indices = cp.searchsorted(node_indices, dst_indices).astype(index_dtype)
-    # Renumber step 3: node values
-    node_values = {key: val[node_indices] for key, val in G.node_values.items()}
-    node_masks = {key: val[node_indices] for key, val in G.node_masks.items()}
-    # Renumber step 4: key_to_id
-    if (id_to_key := G.id_to_key) is not None:
-        key_to_id = {
-            id_to_key[old_index]: new_index
-            for new_index, old_index in enumerate(node_indices.tolist())
-        }
-    else:
-        key_to_id = None
-    # Same as calling `G.from_coo`, but use __class__ to indicate it's a classmethod.
-    new_graph = G.__class__.from_coo(
-        node_indices.size,
-        src_indices,
-        dst_indices,
-        edge_values,
-        edge_masks,
-        node_values,
-        node_masks,
-        key_to_id=key_to_id,
-        use_compat_graph=is_compat_graph,
-    )
-    new_graph.graph.update(G.graph)
-    return new_graph
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/dag.py b/python/nx-cugraph/nx_cugraph/algorithms/dag.py
deleted file mode 100644
index 64be0a58105..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/dag.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-import numpy as np
-import pylibcugraph as plc
-
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import index_dtype, networkx_algorithm
-
-__all__ = [
-    "descendants",
-    "ancestors",
-]
-
-
-def _ancestors_and_descendants(G, source, *, is_ancestors):
-    G = _to_graph(G)
-    if source not in G:
-        hash(source)  # To raise TypeError if appropriate
-        raise nx.NetworkXError(
-            f"The node {source} is not in the {G.__class__.__name__.lower()}."
-        )
-    src_index = source if G.key_to_id is None else G.key_to_id[source]
-    distances, predecessors, node_ids = plc.bfs(
-        handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(switch_indices=is_ancestors),
-        sources=cp.array([src_index], dtype=index_dtype),
-        direction_optimizing=False,
-        depth_limit=-1,
-        compute_predecessors=False,
-        do_expensive_check=False,
-    )
-    mask = (distances != np.iinfo(distances.dtype).max) & (distances != 0)
-    return G._nodearray_to_set(node_ids[mask])
-
-
-@networkx_algorithm(version_added="24.02", _plc="bfs")
-def descendants(G, source):
-    return _ancestors_and_descendants(G, source, is_ancestors=False)
-
-
-@networkx_algorithm(version_added="24.02", _plc="bfs")
-def ancestors(G, source):
-    return _ancestors_and_descendants(G, source, is_ancestors=True)
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/isolate.py b/python/nx-cugraph/nx_cugraph/algorithms/isolate.py
deleted file mode 100644
index 47a349bcf31..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/isolate.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-import cupy as cp
-import numpy as np
-
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import index_dtype, networkx_algorithm
-
-if TYPE_CHECKING:  # pragma: no cover
-    from nx_cugraph.typing import IndexValue
-
-__all__ = ["is_isolate", "isolates", "number_of_isolates"]
-
-
-@networkx_algorithm(version_added="23.10")
-def is_isolate(G, n):
-    G = _to_graph(G)
-    index = n if G.key_to_id is None else G.key_to_id[n]
-    return not (
-        (G.src_indices == index).any().tolist()
-        or G.is_directed()
-        and (G.dst_indices == index).any().tolist()
-    )
-
-
-@is_isolate._should_run
-def _(G, n):
-    return "Fast algorithm; not worth converting."
-
-
-def _mark_isolates(G, symmetrize=None) -> cp.ndarray[bool]:
-    """Return a boolean mask array indicating indices of isolated nodes."""
-    mark_isolates = cp.ones(len(G), bool)
-    if G.is_directed() and symmetrize == "intersection":
-        N = G._N
-        # Upcast to int64 so indices don't overflow
-        src_dst = N * G.src_indices.astype(np.int64) + G.dst_indices
-        src_dst_T = G.src_indices + N * G.dst_indices.astype(np.int64)
-        src_dst_new = cp.intersect1d(src_dst, src_dst_T)
-        new_indices = cp.floor_divide(src_dst_new, N, dtype=index_dtype)
-        mark_isolates[new_indices] = False
-    else:
-        mark_isolates[G.src_indices] = False
-        if G.is_directed():
-            mark_isolates[G.dst_indices] = False
-    return mark_isolates
-
-
-def _isolates(G, symmetrize=None) -> cp.ndarray[IndexValue]:
-    """Like isolates, but return an array of indices instead of an iterator of nodes."""
-    G = _to_graph(G)
-    return cp.nonzero(_mark_isolates(G, symmetrize=symmetrize))[0]
-
-
-@networkx_algorithm(version_added="23.10")
-def isolates(G):
-    G = _to_graph(G)
-    return G._nodeiter_to_iter(iter(_isolates(G).tolist()))
-
-
-@networkx_algorithm(version_added="23.10")
-def number_of_isolates(G):
-    G = _to_graph(G)
-    return int(cp.count_nonzero(_mark_isolates(G)))
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/__init__.py
deleted file mode 100644
index a68d6940d02..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .hits_alg import *
-from .pagerank_alg import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py
deleted file mode 100644
index cc59fd5eb64..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-import numpy as np
-import pylibcugraph as plc
-
-from nx_cugraph import _nxver
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import (
-    _dtype_param,
-    _get_float_dtype,
-    index_dtype,
-    networkx_algorithm,
-)
-
-__all__ = ["hits"]
-
-
-@networkx_algorithm(
-    extra_params={
-        'weight : string or None, optional (default="weight")': (
-            "The edge attribute to use as the edge weight."
-        ),
-        **_dtype_param,
-    },
-    version_added="23.12",
-    _plc="hits",
-)
-def hits(
-    G,
-    max_iter=100,
-    tol=1.0e-8,
-    nstart=None,
-    normalized=True,
-    *,
-    weight="weight",
-    dtype=None,
-):
-    G = _to_graph(G, weight, 1, np.float32)
-    if (N := len(G)) == 0:
-        return {}, {}
-    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-    if nstart is not None:
-        nstart = G._dict_to_nodearray(nstart, 0, dtype)
-    if max_iter <= 0:
-        if _nxver <= (3, 2):
-            raise ValueError("`maxiter` must be a positive integer.")
-        raise nx.PowerIterationFailedConvergence(max_iter)
-    try:
-        node_ids, hubs, authorities = plc.hits(
-            resource_handle=plc.ResourceHandle(),
-            graph=G._get_plc_graph(weight, 1, dtype, store_transposed=True),
-            tol=tol,
-            initial_hubs_guess_vertices=(
-                None if nstart is None else cp.arange(N, dtype=index_dtype)
-            ),
-            initial_hubs_guess_values=nstart,
-            max_iter=max_iter,
-            normalized=normalized,
-            do_expensive_check=False,
-        )
-    except RuntimeError as exc:
-        # Errors from PLC are sometimes a little scary and not very helpful
-        raise nx.PowerIterationFailedConvergence(max_iter) from exc
-    return (
-        G._nodearrays_to_dict(node_ids, hubs),
-        G._nodearrays_to_dict(node_ids, authorities),
-    )
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py
deleted file mode 100644
index 41203a2bc22..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-import numpy as np
-import pylibcugraph as plc
-
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import (
-    _dtype_param,
-    _get_float_dtype,
-    index_dtype,
-    networkx_algorithm,
-)
-
-__all__ = ["pagerank"]
-
-
-@networkx_algorithm(
-    extra_params=_dtype_param,
-    is_incomplete=True,  # dangling not supported
-    version_added="23.12",
-    _plc={"pagerank", "personalized_pagerank"},
-)
-def pagerank(
-    G,
-    alpha=0.85,
-    personalization=None,
-    max_iter=100,
-    tol=1.0e-6,
-    nstart=None,
-    weight="weight",
-    dangling=None,
-    *,
-    dtype=None,
-):
-    """`dangling` parameter is not supported, but it is checked for validity."""
-    G = _to_graph(G, weight, 1, np.float32)
-    if (N := len(G)) == 0:
-        return {}
-    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-    if nstart is not None:
-        nstart = G._dict_to_nodearray(nstart, 0, dtype=dtype)
-        if (total := nstart.sum()) == 0:
-            raise ZeroDivisionError
-        nstart /= total
-    if personalization is not None:
-        personalization = G._dict_to_nodearray(personalization, 0, dtype=dtype)
-        if (total := personalization.sum()) == 0:
-            raise ZeroDivisionError
-        personalization /= total
-    if dangling is not None:
-        # Check if given dangling is valid even though we don't use it
-        dangling = G._dict_to_nodearray(dangling, 0)  # Check validity
-        if dangling.sum() == 0:
-            raise ZeroDivisionError
-        if (G._out_degrees_array() == 0).any():
-            raise NotImplementedError("custom dangling weights is not supported")
-    if max_iter <= 0:
-        raise nx.PowerIterationFailedConvergence(max_iter)
-    kwargs = {
-        "resource_handle": plc.ResourceHandle(),
-        "graph": G._get_plc_graph(weight, 1, dtype, store_transposed=True),
-        "precomputed_vertex_out_weight_vertices": None,
-        "precomputed_vertex_out_weight_sums": None,
-        "initial_guess_vertices": (
-            None if nstart is None else cp.arange(N, dtype=index_dtype)
-        ),
-        "initial_guess_values": nstart,
-        "alpha": alpha,
-        "epsilon": N * tol,
-        "max_iterations": max_iter,
-        "do_expensive_check": False,
-        "fail_on_nonconvergence": False,
-    }
-    if personalization is None:
-        node_ids, values, is_converged = plc.pagerank(**kwargs)
-    else:
-        node_ids, values, is_converged = plc.personalized_pagerank(
-            personalization_vertices=cp.arange(N, dtype=index_dtype),  # Why?
-            personalization_values=personalization,
-            **kwargs,
-        )
-    if not is_converged:
-        raise nx.PowerIterationFailedConvergence(max_iter)
-    return G._nodearrays_to_dict(node_ids, values)
-
-
-@pagerank._can_run
-def _(
-    G,
-    alpha=0.85,
-    personalization=None,
-    max_iter=100,
-    tol=1.0e-6,
-    nstart=None,
-    weight="weight",
-    dangling=None,
-    *,
-    dtype=None,
-):
-    return dangling is None
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/operators/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/operators/__init__.py
deleted file mode 100644
index 32fd45f5726..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/operators/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .unary import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py b/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py
deleted file mode 100644
index 75dc5fbc706..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-import numpy as np
-
-import nx_cugraph as nxcg
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import index_dtype, networkx_algorithm
-
-__all__ = ["complement", "reverse"]
-
-
-@networkx_algorithm(version_added="24.02")
-def complement(G):
-    is_compat_graph = isinstance(G, nxcg.Graph)
-    G = _to_graph(G)
-    N = G._N
-    # Upcast to int64 so indices don't overflow.
-    edges_a_b = N * G.src_indices.astype(np.int64) + G.dst_indices
-    # Now compute flattened indices for all edges except self-loops
-    # Alt (slower):
-    # edges_full = np.arange(N * N)
-    # edges_full = edges_full[(edges_full % (N + 1)).astype(bool)]
-    edges_full = cp.arange(1, N * (N - 1) + 1) + cp.repeat(cp.arange(N - 1), N)
-    edges_comp = cp.setdiff1d(
-        edges_full,
-        edges_a_b,
-        assume_unique=not G.is_multigraph(),
-    )
-    src_indices, dst_indices = cp.divmod(edges_comp, N)
-    return G.__class__.from_coo(
-        N,
-        src_indices.astype(index_dtype),
-        dst_indices.astype(index_dtype),
-        key_to_id=G.key_to_id,
-        use_compat_graph=is_compat_graph,
-    )
-
-
-@networkx_algorithm(version_added="24.02")
-def reverse(G, copy=True):
-    if not G.is_directed():
-        raise nx.NetworkXError("Cannot reverse an undirected graph.")
-    if isinstance(G, nx.Graph):
-        is_compat_graph = isinstance(G, nxcg.Graph)
-        if not copy and not is_compat_graph:
-            raise RuntimeError(
-                "Using `copy=False` is invalid when using a NetworkX graph "
-                "as input to `nx_cugraph.reverse`"
-            )
-        G = nxcg.from_networkx(G, preserve_all_attrs=True)
-    else:
-        is_compat_graph = False
-    rv = G.reverse(copy=copy)
-    if is_compat_graph:
-        return rv._to_compat_graph()
-    return rv
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/reciprocity.py b/python/nx-cugraph/nx_cugraph/algorithms/reciprocity.py
deleted file mode 100644
index c87abdf9fa7..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/reciprocity.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-import numpy as np
-
-from nx_cugraph.convert import _to_directed_graph
-from nx_cugraph.utils import networkx_algorithm, not_implemented_for
-
-__all__ = ["reciprocity", "overall_reciprocity"]
-
-
-@not_implemented_for("undirected", "multigraph")
-@networkx_algorithm(version_added="24.02")
-def reciprocity(G, nodes=None):
-    if nodes is None:
-        return overall_reciprocity(G)
-    G = _to_directed_graph(G)
-    N = G._N
-    # 'nodes' can also be a single node identifier
-    if nodes in G:
-        index = nodes if G.key_to_id is None else G.key_to_id[nodes]
-        mask = (G.src_indices == index) | (G.dst_indices == index)
-        src_indices = G.src_indices[mask]
-        if src_indices.size == 0:
-            raise nx.NetworkXError("Not defined for isolated nodes.")
-        dst_indices = G.dst_indices[mask]
-        # Create two lists of edge identifiers, one for each direction.
-        # Edge identifiers can be created from a pair of node
-        # identifiers. Simply adding src IDs to dst IDs is not adequate, so
-        # make one set of values (either src or dst depending on direction)
-        # unique by multiplying values by N.
-        # Upcast to int64 so indices don't overflow.
-        edges_a_b = N * src_indices.astype(np.int64) + dst_indices
-        edges_b_a = src_indices + N * dst_indices.astype(np.int64)
-        # Find the matching edge identifiers in each list. The edge identifier
-        # generation ensures the ID for A->B == the ID for B->A
-        recip_indices = cp.intersect1d(
-            edges_a_b,
-            edges_b_a,
-            # assume_unique=True,  # cupy <= 12.2.0 also assumes sorted
-        )
-        num_selfloops = (src_indices == dst_indices).sum().tolist()
-        return (recip_indices.size - num_selfloops) / edges_a_b.size
-
-    # Don't include self-loops
-    mask = G.src_indices != G.dst_indices
-    src_indices = G.src_indices[mask]
-    dst_indices = G.dst_indices[mask]
-    # Create two lists of edges, one for each direction, and find the matching
-    # IDs in each list (see description above).
-    edges_a_b = N * src_indices.astype(np.int64) + dst_indices
-    edges_b_a = src_indices + N * dst_indices.astype(np.int64)
-    recip_indices = cp.intersect1d(
-        edges_a_b,
-        edges_b_a,
-        # assume_unique=True,  # cupy <= 12.2.0 also assumes sorted
-    )
-    numer = cp.bincount(recip_indices // N, minlength=N)
-    denom = cp.bincount(src_indices, minlength=N)
-    denom += cp.bincount(dst_indices, minlength=N)
-    recip = 2 * numer / denom
-    node_ids = G._nodekeys_to_nodearray(nodes)
-    return G._nodearrays_to_dict(node_ids, recip[node_ids])
-
-
-@not_implemented_for("undirected", "multigraph")
-@networkx_algorithm(version_added="24.02")
-def overall_reciprocity(G):
-    G = _to_directed_graph(G)
-    if G.number_of_edges() == 0:
-        raise nx.NetworkXError("Not defined for empty graphs")
-    # Create two lists of edges, one for each direction, and find the matching
-    # IDs in each list (see description in reciprocity()).
-    edges_a_b = G._N * G.src_indices.astype(np.int64) + G.dst_indices
-    edges_b_a = G.src_indices + G._N * G.dst_indices.astype(np.int64)
-    recip_indices = cp.intersect1d(
-        edges_a_b,
-        edges_b_a,
-        # assume_unique=True,  # cupy <= 12.2.0 also assumes sorted
-    )
-    num_selfloops = (G.src_indices == G.dst_indices).sum().tolist()
-    return (recip_indices.size - num_selfloops) / edges_a_b.size
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/__init__.py
deleted file mode 100644
index 9d87389a98e..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .generic import *
-from .unweighted import *
-from .weighted import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py
deleted file mode 100644
index ab3c7214303..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import numpy as np
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import _dtype_param, _get_float_dtype, networkx_algorithm
-
-from .unweighted import _bfs
-from .weighted import _sssp
-
-__all__ = [
-    "shortest_path",
-    "shortest_path_length",
-    "has_path",
-]
-
-
-@networkx_algorithm(version_added="24.04", _plc="bfs")
-def has_path(G, source, target):
-    # TODO PERF: make faster in core
-    try:
-        nxcg.bidirectional_shortest_path(G, source, target)
-    except nx.NetworkXNoPath:
-        return False
-    return True
-
-
-@networkx_algorithm(
-    extra_params=_dtype_param, version_added="24.04", _plc={"bfs", "sssp"}
-)
-def shortest_path(
-    G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None
-):
-    """Negative weights are not yet supported."""
-    if method not in {"dijkstra", "bellman-ford"}:
-        raise ValueError(f"method not supported: {method}")
-    if weight is None:
-        method = "unweighted"
-    if source is None:
-        if target is None:
-            # All pairs
-            if method == "unweighted":
-                paths = nxcg.all_pairs_shortest_path(G)
-            elif method == "dijkstra":
-                paths = nxcg.all_pairs_dijkstra_path(G, weight=weight, dtype=dtype)
-            else:  # method == 'bellman-ford':
-                paths = nxcg.all_pairs_bellman_ford_path(G, weight=weight, dtype=dtype)
-            if _nxver <= (3, 4):
-                paths = dict(paths)
-        # To target
-        elif method == "unweighted":
-            paths = nxcg.single_target_shortest_path(G, target)
-        else:
-            # method == "dijkstra":
-            # method == 'bellman-ford':
-            # XXX: it seems weird that `reverse_path=True` is necessary here
-            G = _to_graph(G, weight, 1, np.float32)
-            dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-            paths = _sssp(
-                G, target, weight, return_type="path", dtype=dtype, reverse_path=True
-            )
-    elif target is None:
-        # From source
-        if method == "unweighted":
-            paths = nxcg.single_source_shortest_path(G, source)
-        elif method == "dijkstra":
-            paths = nxcg.single_source_dijkstra_path(
-                G, source, weight=weight, dtype=dtype
-            )
-        else:  # method == 'bellman-ford':
-            paths = nxcg.single_source_bellman_ford_path(
-                G, source, weight=weight, dtype=dtype
-            )
-    # From source to target
-    elif method == "unweighted":
-        paths = nxcg.bidirectional_shortest_path(G, source, target)
-    else:
-        # method == "dijkstra":
-        # method == 'bellman-ford':
-        paths = nxcg.bellman_ford_path(G, source, target, weight, dtype=dtype)
-    return paths
-
-
-@shortest_path._can_run
-def _(G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None):
-    return (
-        weight is None
-        or not callable(weight)
-        and not nx.is_negatively_weighted(G, weight=weight)
-    )
-
-
-@networkx_algorithm(
-    extra_params=_dtype_param, version_added="24.04", _plc={"bfs", "sssp"}
-)
-def shortest_path_length(
-    G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None
-):
-    """Negative weights are not yet supported."""
-    if method not in {"dijkstra", "bellman-ford"}:
-        raise ValueError(f"method not supported: {method}")
-    if weight is None:
-        method = "unweighted"
-    if source is None:
-        if target is None:
-            # All pairs
-            if method == "unweighted":
-                lengths = nxcg.all_pairs_shortest_path_length(G)
-            elif method == "dijkstra":
-                lengths = nxcg.all_pairs_dijkstra_path_length(
-                    G, weight=weight, dtype=dtype
-                )
-            else:  # method == 'bellman-ford':
-                lengths = nxcg.all_pairs_bellman_ford_path_length(
-                    G, weight=weight, dtype=dtype
-                )
-        # To target
-        elif method == "unweighted":
-            lengths = nxcg.single_target_shortest_path_length(G, target)
-            if _nxver <= (3, 4):
-                lengths = dict(lengths)
-        elif method == "dijkstra":
-            lengths = nxcg.single_source_dijkstra_path_length(
-                G, target, weight=weight, dtype=dtype
-            )
-        else:  # method == 'bellman-ford':
-            lengths = nxcg.single_source_bellman_ford_path_length(
-                G, target, weight=weight, dtype=dtype
-            )
-    elif target is None:
-        # From source
-        if method == "unweighted":
-            lengths = nxcg.single_source_shortest_path_length(G, source)
-        elif method == "dijkstra":
-            lengths = nxcg.single_source_dijkstra_path_length(
-                G, source, weight=weight, dtype=dtype
-            )
-        else:  # method == 'bellman-ford':
-            lengths = nxcg.single_source_bellman_ford_path_length(
-                G, source, weight=weight, dtype=dtype
-            )
-    # From source to target
-    elif method == "unweighted":
-        G = _to_graph(G)
-        lengths = _bfs(G, source, None, "Source", return_type="length", target=target)
-    elif method == "dijkstra":
-        lengths = nxcg.dijkstra_path_length(G, source, target, weight, dtype=dtype)
-    else:  # method == 'bellman-ford':
-        lengths = nxcg.bellman_ford_path_length(G, source, target, weight, dtype=dtype)
-    return lengths
-
-
-@shortest_path_length._can_run
-def _(G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None):
-    return (
-        weight is None
-        or not callable(weight)
-        and not nx.is_negatively_weighted(G, weight=weight)
-    )
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py
deleted file mode 100644
index e9c515632ca..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import itertools
-
-import cupy as cp
-import networkx as nx
-import numpy as np
-import pylibcugraph as plc
-
-from nx_cugraph import _nxver
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import _groupby, index_dtype, networkx_algorithm
-
-__all__ = [
-    "bidirectional_shortest_path",
-    "single_source_shortest_path",
-    "single_source_shortest_path_length",
-    "single_target_shortest_path",
-    "single_target_shortest_path_length",
-    "all_pairs_shortest_path",
-    "all_pairs_shortest_path_length",
-]
-
-concat = itertools.chain.from_iterable
-
-
-@networkx_algorithm(version_added="23.12", _plc="bfs")
-def single_source_shortest_path_length(G, source, cutoff=None):
-    G = _to_graph(G)
-    return _bfs(G, source, cutoff, "Source", return_type="length")
-
-
-@networkx_algorithm(version_added="23.12", _plc="bfs")
-def single_target_shortest_path_length(G, target, cutoff=None):
-    G = _to_graph(G)
-    rv = _bfs(G, target, cutoff, "Target", return_type="length")
-    if _nxver <= (3, 4):
-        return iter(rv.items())
-    return rv
-
-
-@networkx_algorithm(version_added="24.04", _plc="bfs")
-def all_pairs_shortest_path_length(G, cutoff=None):
-    # TODO PERF: batched bfs to compute many at once
-    G = _to_graph(G)
-    for n in G:
-        yield (n, _bfs(G, n, cutoff, "Source", return_type="length"))
-
-
-@networkx_algorithm(version_added="24.04", _plc="bfs")
-def bidirectional_shortest_path(G, source, target):
-    # TODO PERF: do bidirectional traversal in core
-    G = _to_graph(G)
-    if source not in G or target not in G:
-        if _nxver <= (3, 3):
-            raise nx.NodeNotFound(
-                f"Either source {source} or target {target} is not in G"
-            )
-        missing = f"Source {source}" if source not in G else f"Target {target}"
-        raise nx.NodeNotFound(f"{missing} is not in G")
-    return _bfs(G, source, None, "Source", return_type="path", target=target)
-
-
-@networkx_algorithm(version_added="24.04", _plc="bfs")
-def single_source_shortest_path(G, source, cutoff=None):
-    G = _to_graph(G)
-    return _bfs(G, source, cutoff, "Source", return_type="path")
-
-
-@networkx_algorithm(version_added="24.04", _plc="bfs")
-def single_target_shortest_path(G, target, cutoff=None):
-    G = _to_graph(G)
-    return _bfs(G, target, cutoff, "Target", return_type="path", reverse_path=True)
-
-
-@networkx_algorithm(version_added="24.04", _plc="bfs")
-def all_pairs_shortest_path(G, cutoff=None):
-    # TODO PERF: batched bfs to compute many at once
-    G = _to_graph(G)
-    for n in G:
-        yield (n, _bfs(G, n, cutoff, "Source", return_type="path"))
-
-
-def _bfs(
-    G, source, cutoff, kind, *, return_type, reverse_path=False, target=None, scale=None
-):
-    """BFS for unweighted shortest path algorithms.
-
-    Parameters
-    ----------
-    source: node label
-
-    cutoff: int, optional
-
-    kind: {"Source", "Target"}
-
-    return_type: {"length", "path", "length-path"}
-
-    reverse_path: bool
-
-    target: node label
-
-    scale: int or float, optional
-        The amount to scale the lengths
-    """
-    # DRY: _sssp in weighted.py has similar code
-    if source not in G:
-        # Different message to pass networkx tests
-        if return_type == "length":
-            raise nx.NodeNotFound(f"{kind} {source} is not in G")
-        raise nx.NodeNotFound(f"{kind} {source} not in G")
-    if target is not None:
-        if source == target or cutoff is not None and cutoff <= 0:
-            if return_type == "path":
-                return [source]
-            if return_type == "length":
-                return 0
-            # return_type == "length-path"
-            return 0, [source]
-        if target not in G or G.src_indices.size == 0:
-            raise nx.NetworkXNoPath(f"Node {target} not reachable from {source}")
-    elif G.src_indices.size == 0 or cutoff is not None and cutoff <= 0:
-        if return_type == "path":
-            return {source: [source]}
-        if return_type == "length":
-            return {source: 0}
-        # return_type == "length-path"
-        return {source: 0}, {source: [source]}
-
-    if cutoff is None or np.isinf(cutoff):
-        cutoff = -1
-    src_index = source if G.key_to_id is None else G.key_to_id[source]
-    distances, predecessors, node_ids = plc.bfs(
-        handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(switch_indices=kind == "Target"),
-        sources=cp.array([src_index], index_dtype),
-        direction_optimizing=False,  # True for undirected only; what's recommended?
-        depth_limit=cutoff,
-        compute_predecessors=return_type != "length",
-        do_expensive_check=False,
-    )
-    mask = distances != np.iinfo(distances.dtype).max
-    node_ids = node_ids[mask]
-    if return_type != "path":
-        lengths = distances = distances[mask]
-        if scale is not None:
-            lengths = scale * lengths
-        lengths = G._nodearrays_to_dict(node_ids, lengths)
-        if target is not None:
-            if target not in lengths:
-                raise nx.NetworkXNoPath(f"Node {target} not reachable from {source}")
-            lengths = lengths[target]
-    if return_type != "length":
-        if target is not None:
-            d = dict(zip(node_ids.tolist(), predecessors[mask].tolist()))
-            dst_index = target if G.key_to_id is None else G.key_to_id[target]
-            if dst_index not in d:
-                raise nx.NetworkXNoPath(f"Node {target} not reachable from {source}")
-            cur = dst_index
-            paths = [dst_index]
-            while cur != src_index:
-                cur = d[cur]
-                paths.append(cur)
-            if (id_to_key := G.id_to_key) is not None:
-                if reverse_path:
-                    paths = [id_to_key[cur] for cur in paths]
-                else:
-                    paths = [id_to_key[cur] for cur in reversed(paths)]
-            elif not reverse_path:
-                paths.reverse()
-        else:
-            if return_type == "path":
-                distances = distances[mask]
-            groups = _groupby(distances, [predecessors[mask], node_ids])
-
-            # `pred_node_iter` does the equivalent as these nested for loops:
-            # for length in range(1, len(groups)):
-            #     preds, nodes = groups[length]
-            #     for pred, node in zip(preds.tolist(), nodes.tolist()):
-            if G.key_to_id is None:
-                pred_node_iter = concat(
-                    zip(*(x.tolist() for x in groups[length]))
-                    for length in range(1, len(groups))
-                )
-            else:
-                pred_node_iter = concat(
-                    zip(*(G._nodeiter_to_iter(x.tolist()) for x in groups[length]))
-                    for length in range(1, len(groups))
-                )
-            # Consider making utility functions for creating paths
-            paths = {source: [source]}
-            if reverse_path:
-                for pred, node in pred_node_iter:
-                    paths[node] = [node, *paths[pred]]
-            else:
-                for pred, node in pred_node_iter:
-                    paths[node] = [*paths[pred], node]
-    if return_type == "path":
-        return paths
-    if return_type == "length":
-        return lengths
-    # return_type == "length-path"
-    return lengths, paths
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py
deleted file mode 100644
index 032ef2c7fdf..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py
+++ /dev/null
@@ -1,402 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import numpy as np
-import pylibcugraph as plc
-
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import (
-    _dtype_param,
-    _get_float_dtype,
-    _groupby,
-    networkx_algorithm,
-)
-
-from .unweighted import _bfs
-
-__all__ = [
-    "dijkstra_path",
-    "dijkstra_path_length",
-    "single_source_dijkstra",
-    "single_source_dijkstra_path",
-    "single_source_dijkstra_path_length",
-    "all_pairs_dijkstra",
-    "all_pairs_dijkstra_path",
-    "all_pairs_dijkstra_path_length",
-    "bellman_ford_path",
-    "bellman_ford_path_length",
-    "single_source_bellman_ford",
-    "single_source_bellman_ford_path",
-    "single_source_bellman_ford_path_length",
-    "all_pairs_bellman_ford_path",
-    "all_pairs_bellman_ford_path_length",
-]
-
-
-def _add_doc(func):
-    func.__doc__ = (
-        "Negative cycles are not yet supported. ``NotImplementedError`` will be raised "
-        "if there are negative edge weights. We plan to support negative edge weights "
-        "soon. Also, callable ``weight`` argument is not supported."
-    )
-    return func
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp")
-def dijkstra_path(G, source, target, weight="weight", *, dtype=None):
-    G = _to_graph(G, weight, 1, np.float32)
-    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-    return _sssp(G, source, weight, target, return_type="path", dtype=dtype)
-
-
-@dijkstra_path._can_run
-def _(G, source, target, weight="weight", *, dtype=None):
-    return not callable(weight)
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
-@_add_doc
-def bellman_ford_path(G, source, target, weight="weight", *, dtype=None):
-    return dijkstra_path(G, source, target, weight=weight, dtype=dtype)
-
-
-@bellman_ford_path._can_run
-def _(G, source, target, weight="weight", *, dtype=None):
-    return (
-        weight is None
-        or not callable(weight)
-        and not nx.is_negatively_weighted(G, weight=weight)
-    )
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp")
-def dijkstra_path_length(G, source, target, weight="weight", *, dtype=None):
-    G = _to_graph(G, weight, 1, np.float32)
-    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-    return _sssp(G, source, weight, target, return_type="length", dtype=dtype)
-
-
-@dijkstra_path._can_run
-def _(G, source, target, weight="weight", *, dtype=None):
-    return not callable(weight)
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
-@_add_doc
-def bellman_ford_path_length(G, source, target, weight="weight", *, dtype=None):
-    return dijkstra_path_length(G, source, target, weight=weight, dtype=dtype)
-
-
-@bellman_ford_path_length._can_run
-def _(G, source, target, weight="weight", *, dtype=None):
-    return (
-        weight is None
-        or not callable(weight)
-        and not nx.is_negatively_weighted(G, weight=weight)
-    )
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp")
-def single_source_dijkstra_path(G, source, cutoff=None, weight="weight", *, dtype=None):
-    G = _to_graph(G, weight, 1, np.float32)
-    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-    return _sssp(G, source, weight, return_type="path", dtype=dtype, cutoff=cutoff)
-
-
-@single_source_dijkstra_path._can_run
-def _(G, source, cutoff=None, weight="weight", *, dtype=None):
-    return not callable(weight)
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
-@_add_doc
-def single_source_bellman_ford_path(G, source, weight="weight", *, dtype=None):
-    return single_source_dijkstra_path(G, source, weight=weight, dtype=dtype)
-
-
-@single_source_bellman_ford_path._can_run
-def _(G, source, weight="weight", *, dtype=None):
-    return (
-        weight is None
-        or not callable(weight)
-        and not nx.is_negatively_weighted(G, weight=weight)
-    )
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp")
-def single_source_dijkstra_path_length(
-    G, source, cutoff=None, weight="weight", *, dtype=None
-):
-    G = _to_graph(G, weight, 1, np.float32)
-    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-    return _sssp(G, source, weight, return_type="length", dtype=dtype, cutoff=cutoff)
-
-
-@single_source_dijkstra_path_length._can_run
-def _(G, source, cutoff=None, weight="weight", *, dtype=None):
-    return not callable(weight)
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
-@_add_doc
-def single_source_bellman_ford_path_length(G, source, weight="weight", *, dtype=None):
-    return single_source_dijkstra_path_length(G, source, weight=weight, dtype=dtype)
-
-
-@single_source_bellman_ford_path_length._can_run
-def _(G, source, weight="weight", *, dtype=None):
-    return (
-        weight is None
-        or not callable(weight)
-        and not nx.is_negatively_weighted(G, weight=weight)
-    )
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp")
-def single_source_dijkstra(
-    G, source, target=None, cutoff=None, weight="weight", *, dtype=None
-):
-    G = _to_graph(G, weight, 1, np.float32)
-    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-    return _sssp(
-        G, source, weight, target, return_type="length-path", dtype=dtype, cutoff=cutoff
-    )
-
-
-@single_source_dijkstra._can_run
-def _(G, source, target=None, cutoff=None, weight="weight", *, dtype=None):
-    return not callable(weight)
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
-@_add_doc
-def single_source_bellman_ford(G, source, target=None, weight="weight", *, dtype=None):
-    return single_source_dijkstra(G, source, target=target, weight=weight, dtype=dtype)
-
-
-@single_source_bellman_ford._can_run
-def _(G, source, target=None, weight="weight", *, dtype=None):
-    return (
-        weight is None
-        or not callable(weight)
-        and not nx.is_negatively_weighted(G, weight=weight)
-    )
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp")
-def all_pairs_dijkstra(G, cutoff=None, weight="weight", *, dtype=None):
-    # TODO PERF: batched bfs to compute many at once
-    G = _to_graph(G, weight, 1, np.float32)
-    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-    for n in G:
-        yield (
-            n,
-            _sssp(G, n, weight, return_type="length-path", dtype=dtype, cutoff=cutoff),
-        )
-
-
-@all_pairs_dijkstra._can_run
-def _(G, cutoff=None, weight="weight", *, dtype=None):
-    return not callable(weight)
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp")
-def all_pairs_dijkstra_path_length(G, cutoff=None, weight="weight", *, dtype=None):
-    # TODO PERF: batched bfs to compute many at once
-    G = _to_graph(G, weight, 1, np.float32)
-    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-    for n in G:
-        yield (n, _sssp(G, n, weight, return_type="length", dtype=dtype, cutoff=cutoff))
-
-
-@all_pairs_dijkstra_path_length._can_run
-def _(G, cutoff=None, weight="weight", *, dtype=None):
-    return not callable(weight)
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
-@_add_doc
-def all_pairs_bellman_ford_path_length(G, weight="weight", *, dtype=None):
-    return all_pairs_dijkstra_path_length(G, weight=weight, dtype=None)
-
-
-@all_pairs_bellman_ford_path_length._can_run
-def _(G, weight="weight", *, dtype=None):
-    return (
-        weight is None
-        or not callable(weight)
-        and not nx.is_negatively_weighted(G, weight=weight)
-    )
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp")
-def all_pairs_dijkstra_path(G, cutoff=None, weight="weight", *, dtype=None):
-    # TODO PERF: batched bfs to compute many at once
-    G = _to_graph(G, weight, 1, np.float32)
-    dtype = _get_float_dtype(dtype, graph=G, weight=weight)
-    for n in G:
-        yield (n, _sssp(G, n, weight, return_type="path", dtype=dtype, cutoff=cutoff))
-
-
-@all_pairs_dijkstra_path._can_run
-def _(G, cutoff=None, weight="weight", *, dtype=None):
-    return not callable(weight)
-
-
-@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp")
-@_add_doc
-def all_pairs_bellman_ford_path(G, weight="weight", *, dtype=None):
-    return all_pairs_dijkstra_path(G, weight=weight, dtype=None)
-
-
-@all_pairs_bellman_ford_path._can_run
-def _(G, weight="weight", *, dtype=None):
-    return (
-        weight is None
-        or not callable(weight)
-        and not nx.is_negatively_weighted(G, weight=weight)
-    )
-
-
-def _sssp(
-    G,
-    source,
-    weight,
-    target=None,
-    *,
-    return_type,
-    dtype,
-    reverse_path=False,
-    cutoff=None,
-):
-    """SSSP for weighted shortest paths.
-
-    Parameters
-    ----------
-    return_type : {"length", "path", "length-path"}
-
-    """
-    # DRY: _bfs in unweighted.py has similar code
-    if source not in G:
-        raise nx.NodeNotFound(f"Node {source} not found in graph")
-    if target is not None:
-        if source == target:
-            if return_type == "path":
-                return [source]
-            if return_type == "length":
-                return 0
-            # return_type == "length-path"
-            return 0, [source]
-        if target not in G or G.src_indices.size == 0:
-            raise nx.NetworkXNoPath(f"Node {target} not reachable from {source}")
-    elif G.src_indices.size == 0:
-        if return_type == "path":
-            return {source: [source]}
-        if return_type == "length":
-            return {source: 0}
-        # return_type == "length-path"
-        return {source: 0}, {source: [source]}
-
-    if callable(weight):
-        raise NotImplementedError("callable `weight` argument is not supported")
-
-    if weight not in G.edge_values:
-        # No edge values, so use BFS instead
-        return _bfs(G, source, cutoff, "Source", return_type=return_type, target=target)
-
-    # Check for negative values since we don't support negative cycles
-    edge_vals = G.edge_values[weight]
-    if weight in G.edge_masks:
-        edge_vals = edge_vals[G.edge_masks[weight]]
-    if (edge_vals < 0).any():
-        raise NotImplementedError("Negative edge weights not yet supported")
-    edge_val = edge_vals[0]
-    if (edge_vals == edge_val).all() and (
-        edge_vals.size == G.src_indices.size or edge_val == 1
-    ):
-        # Edge values are all the same, so use scaled BFS instead
-        return _bfs(
-            G,
-            source,
-            None if cutoff is None else cutoff / edge_val,
-            "Source",
-            return_type=return_type,
-            target=target,
-            scale=edge_val,
-            reverse_path=reverse_path,
-        )
-
-    src_index = source if G.key_to_id is None else G.key_to_id[source]
-    if cutoff is None:
-        cutoff = np.inf
-    else:
-        cutoff = np.nextafter(cutoff, np.inf, dtype=np.float64)
-
-    node_ids, distances, predecessors = plc.sssp(
-        resource_handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(weight, 1, dtype),
-        source=src_index,
-        cutoff=cutoff,
-        compute_predecessors=True,  # TODO: False is not yet supported
-        # compute_predecessors=return_type != "length",
-        do_expensive_check=False,
-    )
-    mask = distances != np.finfo(distances.dtype).max
-    node_ids = node_ids[mask]
-    if return_type != "path":
-        lengths = G._nodearrays_to_dict(node_ids, distances[mask])
-        if target is not None:
-            if target not in lengths:
-                raise nx.NetworkXNoPath(f"Node {target} not reachable from {source}")
-            lengths = lengths[target]
-    if return_type != "length":
-        if target is not None:
-            d = dict(zip(node_ids.tolist(), predecessors[mask].tolist()))
-            dst_index = target if G.key_to_id is None else G.key_to_id[target]
-            if dst_index not in d:
-                raise nx.NetworkXNoPath(f"Node {target} not reachable from {source}")
-            cur = dst_index
-            paths = [dst_index]
-            while cur != src_index:
-                cur = d[cur]
-                paths.append(cur)
-            if (id_to_key := G.id_to_key) is not None:
-                if reverse_path:
-                    paths = [id_to_key[cur] for cur in paths]
-                else:
-                    paths = [id_to_key[cur] for cur in reversed(paths)]
-            elif not reverse_path:
-                paths.reverse()
-        else:
-            groups = _groupby(predecessors[mask], node_ids)
-            if (id_to_key := G.id_to_key) is not None:
-                groups = {id_to_key[k]: v for k, v in groups.items() if k >= 0}
-            paths = {source: [source]}
-            preds = [source]
-            while preds:
-                pred = preds.pop()
-                pred_path = paths[pred]
-                nodes = G._nodearray_to_list(groups[pred])
-                if reverse_path:
-                    for node in nodes:
-                        paths[node] = [node, *pred_path]
-                else:
-                    for node in nodes:
-                        paths[node] = [*pred_path, node]
-                preds.extend(nodes & groups.keys())
-    if return_type == "path":
-        return paths
-    if return_type == "length":
-        return lengths
-    # return_type == "length-path"
-    return lengths, paths
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/traversal/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/traversal/__init__.py
deleted file mode 100644
index 1751cd46919..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/traversal/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .breadth_first_search import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py b/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py
deleted file mode 100644
index 72d0079cf0c..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py
+++ /dev/null
@@ -1,297 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from itertools import repeat
-
-import cupy as cp
-import networkx as nx
-import numpy as np
-import pylibcugraph as plc
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import _groupby, index_dtype, networkx_algorithm
-
-__all__ = [
-    "bfs_edges",
-    "bfs_tree",
-    "bfs_predecessors",
-    "bfs_successors",
-    "descendants_at_distance",
-    "bfs_layers",
-    "generic_bfs_edges",
-]
-
-
-def _check_G_and_source(G, source):
-    G = _to_graph(G)
-    if source not in G:
-        hash(source)  # To raise TypeError if appropriate
-        raise nx.NetworkXError(
-            f"The node {source} is not in the {G.__class__.__name__.lower()}."
-        )
-    return G
-
-
-def _bfs(G, source, *, depth_limit=None, reverse=False):
-    src_index = source if G.key_to_id is None else G.key_to_id[source]
-    distances, predecessors, node_ids = plc.bfs(
-        handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(switch_indices=reverse),
-        sources=cp.array([src_index], dtype=index_dtype),
-        direction_optimizing=False,
-        depth_limit=-1 if depth_limit is None else depth_limit,
-        compute_predecessors=True,
-        do_expensive_check=False,
-    )
-    mask = predecessors >= 0
-    return distances[mask], predecessors[mask], node_ids[mask]
-
-
-if _nxver <= (3, 3):
-
-    @networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs")
-    def generic_bfs_edges(
-        G, source, neighbors=None, depth_limit=None, sort_neighbors=None
-    ):
-        """`neighbors` and `sort_neighbors` parameters are not yet supported."""
-        if neighbors is not None:
-            raise NotImplementedError(
-                "neighbors argument in generic_bfs_edges is not currently supported"
-            )
-        if sort_neighbors is not None:
-            raise NotImplementedError(
-                "sort_neighbors argument in generic_bfs_edges is not supported"
-            )
-        return bfs_edges(G, source, depth_limit=depth_limit)
-
-    @generic_bfs_edges._can_run
-    def _(G, source, neighbors=None, depth_limit=None, sort_neighbors=None):
-        return neighbors is None and sort_neighbors is None
-
-else:
-
-    @networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs")
-    def generic_bfs_edges(G, source, neighbors=None, depth_limit=None):
-        """`neighbors` parameter is not yet supported."""
-        if neighbors is not None:
-            raise NotImplementedError(
-                "neighbors argument in generic_bfs_edges is not currently supported"
-            )
-        return bfs_edges(G, source, depth_limit=depth_limit)
-
-    @generic_bfs_edges._can_run
-    def _(G, source, neighbors=None, depth_limit=None):
-        return neighbors is None
-
-
-@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs")
-def bfs_edges(G, source, reverse=False, depth_limit=None, sort_neighbors=None):
-    """`sort_neighbors` parameter is not yet supported."""
-    if sort_neighbors is not None:
-        raise NotImplementedError(
-            "sort_neighbors argument in bfs_edges is not currently supported"
-        )
-    G = _check_G_and_source(G, source)
-    if depth_limit is not None and depth_limit < 1:
-        return
-    distances, predecessors, node_ids = _bfs(
-        G, source, depth_limit=depth_limit, reverse=reverse
-    )
-    # Using groupby like this is similar to bfs_predecessors
-    groups = _groupby([distances, predecessors], node_ids)
-    id_to_key = G.id_to_key
-    for key in sorted(groups):
-        children_ids = groups[key]
-        parent_id = key[1]
-        parent = id_to_key[parent_id] if id_to_key is not None else parent_id
-        yield from zip(
-            repeat(parent, children_ids.size),
-            G._nodeiter_to_iter(children_ids.tolist()),
-        )
-
-
-@bfs_edges._can_run
-def _(G, source, reverse=False, depth_limit=None, sort_neighbors=None):
-    return sort_neighbors is None
-
-
-@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs")
-def bfs_tree(G, source, reverse=False, depth_limit=None, sort_neighbors=None):
-    """`sort_neighbors` parameter is not yet supported."""
-    if sort_neighbors is not None:
-        raise NotImplementedError(
-            "sort_neighbors argument in bfs_tree is not currently supported"
-        )
-    is_compat_graph = isinstance(G, nxcg.Graph)
-    G = _check_G_and_source(G, source)
-    if depth_limit is not None and depth_limit < 1:
-        return nxcg.CudaDiGraph.from_coo(
-            1,
-            cp.array([], dtype=index_dtype),
-            cp.array([], dtype=index_dtype),
-            id_to_key=[source],
-            use_compat_graph=is_compat_graph,
-        )
-
-    distances, predecessors, node_ids = _bfs(
-        G,
-        source,
-        depth_limit=depth_limit,
-        reverse=reverse,
-    )
-    if predecessors.size == 0:
-        return nxcg.CudaDiGraph.from_coo(
-            1,
-            cp.array([], dtype=index_dtype),
-            cp.array([], dtype=index_dtype),
-            id_to_key=[source],
-            use_compat_graph=is_compat_graph,
-        )
-    # TODO: create renumbering helper function(s)
-    unique_node_ids = cp.unique(cp.hstack((predecessors, node_ids)))
-    # Renumber edges
-    src_indices = cp.searchsorted(unique_node_ids, predecessors).astype(index_dtype)
-    dst_indices = cp.searchsorted(unique_node_ids, node_ids).astype(index_dtype)
-    # Renumber nodes
-    if (id_to_key := G.id_to_key) is not None:
-        key_to_id = {
-            id_to_key[old_index]: new_index
-            for new_index, old_index in enumerate(unique_node_ids.tolist())
-        }
-    else:
-        key_to_id = {
-            old_index: new_index
-            for new_index, old_index in enumerate(unique_node_ids.tolist())
-        }
-    return nxcg.CudaDiGraph.from_coo(
-        unique_node_ids.size,
-        src_indices,
-        dst_indices,
-        key_to_id=key_to_id,
-        use_compat_graph=is_compat_graph,
-    )
-
-
-@bfs_tree._can_run
-def _(G, source, reverse=False, depth_limit=None, sort_neighbors=None):
-    return sort_neighbors is None
-
-
-@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs")
-def bfs_successors(G, source, depth_limit=None, sort_neighbors=None):
-    """`sort_neighbors` parameter is not yet supported."""
-    if sort_neighbors is not None:
-        raise NotImplementedError(
-            "sort_neighbors argument in bfs_successors is not currently supported"
-        )
-    G = _check_G_and_source(G, source)
-    if depth_limit is not None and depth_limit < 1:
-        yield (source, [])
-        return
-
-    distances, predecessors, node_ids = _bfs(G, source, depth_limit=depth_limit)
-    groups = _groupby([distances, predecessors], node_ids)
-    id_to_key = G.id_to_key
-    for key in sorted(groups):
-        children_ids = groups[key]
-        parent_id = key[1]
-        parent = id_to_key[parent_id] if id_to_key is not None else parent_id
-        children = G._nodearray_to_list(children_ids)
-        yield (parent, children)
-
-
-@bfs_successors._can_run
-def _(G, source, depth_limit=None, sort_neighbors=None):
-    return sort_neighbors is None
-
-
-@networkx_algorithm(version_added="24.02", _plc="bfs")
-def bfs_layers(G, sources):
-    G = _to_graph(G)
-    if sources in G:
-        sources = [sources]
-    else:
-        sources = set(sources)
-        if not all(source in G for source in sources):
-            node = next(source for source in sources if source not in G)
-            raise nx.NetworkXError(f"The node {node} is not in the graph.")
-        sources = list(sources)
-    source_ids = G._list_to_nodearray(sources)
-    distances, predecessors, node_ids = plc.bfs(
-        handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(),
-        sources=source_ids,
-        direction_optimizing=False,
-        depth_limit=-1,
-        compute_predecessors=False,
-        do_expensive_check=False,
-    )
-    mask = distances != np.iinfo(distances.dtype).max
-    distances = distances[mask]
-    node_ids = node_ids[mask]
-    groups = _groupby(distances, node_ids)
-    return (G._nodearray_to_list(groups[key]) for key in range(len(groups)))
-
-
-@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs")
-def bfs_predecessors(G, source, depth_limit=None, sort_neighbors=None):
-    """`sort_neighbors` parameter is not yet supported."""
-    if sort_neighbors is not None:
-        raise NotImplementedError(
-            "sort_neighbors argument in bfs_predecessors is not currently supported"
-        )
-    G = _check_G_and_source(G, source)
-    if depth_limit is not None and depth_limit < 1:
-        return
-
-    distances, predecessors, node_ids = _bfs(G, source, depth_limit=depth_limit)
-    # We include `predecessors` in the groupby for "nicer" iteration order
-    groups = _groupby([distances, predecessors], node_ids)
-    id_to_key = G.id_to_key
-    for key in sorted(groups):
-        children_ids = groups[key]
-        parent_id = key[1]
-        parent = id_to_key[parent_id] if id_to_key is not None else parent_id
-        yield from zip(
-            G._nodeiter_to_iter(children_ids.tolist()),
-            repeat(parent, children_ids.size),
-        )
-
-
-@bfs_predecessors._can_run
-def _(G, source, depth_limit=None, sort_neighbors=None):
-    return sort_neighbors is None
-
-
-@networkx_algorithm(version_added="24.02", _plc="bfs")
-def descendants_at_distance(G, source, distance):
-    G = _check_G_and_source(G, source)
-    if distance is None or distance < 0:
-        return set()
-    if distance == 0:
-        return {source}
-
-    src_index = source if G.key_to_id is None else G.key_to_id[source]
-    distances, predecessors, node_ids = plc.bfs(
-        handle=plc.ResourceHandle(),
-        graph=G._get_plc_graph(),
-        sources=cp.array([src_index], dtype=index_dtype),
-        direction_optimizing=False,
-        depth_limit=distance,
-        compute_predecessors=False,
-        do_expensive_check=False,
-    )
-    mask = distances == distance
-    node_ids = node_ids[mask]
-    return G._nodearray_to_set(node_ids)
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/tree/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/tree/__init__.py
deleted file mode 100644
index 91bf72417be..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/tree/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .recognition import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py b/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py
deleted file mode 100644
index 74f57b5ea5a..00000000000
--- a/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-
-import nx_cugraph as nxcg
-from nx_cugraph.convert import _to_directed_graph, _to_graph
-from nx_cugraph.utils import networkx_algorithm, not_implemented_for
-
-__all__ = ["is_arborescence", "is_branching", "is_forest", "is_tree"]
-
-
-@not_implemented_for("undirected")
-@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components")
-def is_arborescence(G):
-    G = _to_directed_graph(G)
-    return is_tree(G) and int(G._in_degrees_array().max()) <= 1
-
-
-@not_implemented_for("undirected")
-@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components")
-def is_branching(G):
-    G = _to_directed_graph(G)
-    return is_forest(G) and int(G._in_degrees_array().max()) <= 1
-
-
-@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components")
-def is_forest(G):
-    G = _to_graph(G)
-    if len(G) == 0:
-        raise nx.NetworkXPointlessConcept("G has no nodes.")
-    if is_directed := G.is_directed():
-        connected_components = nxcg.weakly_connected_components
-    else:
-        connected_components = nxcg.connected_components
-    for components in connected_components(G):
-        node_ids = G._list_to_nodearray(list(components))
-        # TODO: create utilities for creating subgraphs
-        mask = cp.isin(G.src_indices, node_ids) & cp.isin(G.dst_indices, node_ids)
-        # A tree must have an edge count equal to the number of nodes minus the
-        # tree's root node.
-        if is_directed:
-            if int(cp.count_nonzero(mask)) != len(components) - 1:
-                return False
-        else:
-            src_indices = G.src_indices[mask]
-            dst_indices = G.dst_indices[mask]
-            if int(cp.count_nonzero(src_indices <= dst_indices)) != len(components) - 1:
-                return False
-    return True
-
-
-@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components")
-def is_tree(G):
-    G = _to_graph(G)
-    if len(G) == 0:
-        raise nx.NetworkXPointlessConcept("G has no nodes.")
-    if G.is_directed():
-        is_connected = nxcg.is_weakly_connected
-    else:
-        is_connected = nxcg.is_connected
-    # A tree must have an edge count equal to the number of nodes minus the
-    # tree's root node.
-    return len(G) - 1 == G.number_of_edges() and is_connected(G)
diff --git a/python/nx-cugraph/nx_cugraph/classes/__init__.py b/python/nx-cugraph/nx_cugraph/classes/__init__.py
deleted file mode 100644
index 71168e5364f..00000000000
--- a/python/nx-cugraph/nx_cugraph/classes/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .graph import CudaGraph, Graph
-from .digraph import CudaDiGraph, DiGraph
-from .multigraph import CudaMultiGraph, MultiGraph
-from .multidigraph import CudaMultiDiGraph, MultiDiGraph
-
-from .function import *
diff --git a/python/nx-cugraph/nx_cugraph/classes/digraph.py b/python/nx-cugraph/nx_cugraph/classes/digraph.py
deleted file mode 100644
index 178bf44f16e..00000000000
--- a/python/nx-cugraph/nx_cugraph/classes/digraph.py
+++ /dev/null
@@ -1,269 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-from copy import deepcopy
-from typing import TYPE_CHECKING
-
-import cupy as cp
-import networkx as nx
-import numpy as np
-from networkx.classes.digraph import (
-    _CachedPropertyResetterAdjAndSucc,
-    _CachedPropertyResetterPred,
-)
-
-import nx_cugraph as nxcg
-
-from ..utils import index_dtype
-from .graph import CudaGraph, Graph
-
-if TYPE_CHECKING:  # pragma: no cover
-    from nx_cugraph.typing import AttrKey
-
-__all__ = ["CudaDiGraph", "DiGraph"]
-
-networkx_api = nxcg.utils.decorators.networkx_class(nx.DiGraph)
-
-
-class DiGraph(nx.DiGraph, Graph):
-    _nx_attrs = ("_node", "_adj", "_succ", "_pred")
-
-    name = Graph.name
-    _node = Graph._node
-
-    @property
-    @networkx_api
-    def _adj(self):
-        if (adj := self.__dict__["_adj"]) is None:
-            self._reify_networkx()
-            adj = self.__dict__["_adj"]
-        return adj
-
-    @_adj.setter
-    def _adj(self, val):
-        self._prepare_setter()
-        _CachedPropertyResetterAdjAndSucc.__set__(None, self, val)
-        if cache := getattr(self, "__networkx_cache__", None):
-            cache.clear()
-
-    @property
-    @networkx_api
-    def _succ(self):
-        if (succ := self.__dict__["_succ"]) is None:
-            self._reify_networkx()
-            succ = self.__dict__["_succ"]
-        return succ
-
-    @_succ.setter
-    def _succ(self, val):
-        self._prepare_setter()
-        _CachedPropertyResetterAdjAndSucc.__set__(None, self, val)
-        if cache := getattr(self, "__networkx_cache__", None):
-            cache.clear()
-
-    @property
-    @networkx_api
-    def _pred(self):
-        if (pred := self.__dict__["_pred"]) is None:
-            self._reify_networkx()
-            pred = self.__dict__["_pred"]
-        return pred
-
-    @_pred.setter
-    def _pred(self, val):
-        self._prepare_setter()
-        _CachedPropertyResetterPred.__set__(None, self, val)
-        if cache := getattr(self, "__networkx_cache__", None):
-            cache.clear()
-
-    @classmethod
-    @networkx_api
-    def is_directed(cls) -> bool:
-        return True
-
-    @classmethod
-    @networkx_api
-    def is_multigraph(cls) -> bool:
-        return False
-
-    @classmethod
-    def to_cudagraph_class(cls) -> type[CudaDiGraph]:
-        return CudaDiGraph
-
-    @classmethod
-    def to_networkx_class(cls) -> type[nx.DiGraph]:
-        return nx.DiGraph
-
-
-class CudaDiGraph(CudaGraph):
-    #################
-    # Class methods #
-    #################
-
-    is_directed = classmethod(DiGraph.is_directed.__func__)
-    is_multigraph = classmethod(DiGraph.is_multigraph.__func__)
-    to_cudagraph_class = classmethod(DiGraph.to_cudagraph_class.__func__)
-    to_networkx_class = classmethod(DiGraph.to_networkx_class.__func__)
-
-    @classmethod
-    def _to_compat_graph_class(cls) -> type[DiGraph]:
-        return DiGraph
-
-    @networkx_api
-    def size(self, weight: AttrKey | None = None) -> int:
-        if weight is not None:
-            raise NotImplementedError
-        return self.src_indices.size
-
-    ##########################
-    # NetworkX graph methods #
-    ##########################
-
-    @networkx_api
-    def reverse(self, copy: bool = True) -> CudaDiGraph:
-        return self._copy(not copy, self.__class__, reverse=True)
-
-    @networkx_api
-    def to_undirected(self, reciprocal=False, as_view=False):
-        N = self._N
-        # Upcast to int64 so indices don't overflow
-        src_dst_indices_old = N * self.src_indices.astype(np.int64) + self.dst_indices
-        if reciprocal:
-            src_dst_indices_new = cp.intersect1d(
-                src_dst_indices_old,
-                self.src_indices + N * self.dst_indices.astype(np.int64),
-                # assume_unique=True,  # cupy <= 12.2.0 also assumes sorted
-            )
-            if self.edge_values:
-                sorter = cp.argsort(src_dst_indices_old)
-                idx = cp.searchsorted(
-                    src_dst_indices_old, src_dst_indices_new, sorter=sorter
-                )
-                indices = sorter[idx]
-                src_indices = self.src_indices[indices].copy()
-                dst_indices = self.dst_indices[indices].copy()
-                edge_values = {
-                    key: val[indices].copy() for key, val in self.edge_values.items()
-                }
-                edge_masks = {
-                    key: val[indices].copy() for key, val in self.edge_masks.items()
-                }
-            else:
-                src_indices, dst_indices = cp.divmod(src_dst_indices_new, N)
-                src_indices = src_indices.astype(index_dtype)
-                dst_indices = dst_indices.astype(index_dtype)
-        else:
-            src_dst_indices_old_T = self.src_indices + N * self.dst_indices.astype(
-                np.int64
-            )
-            if self.edge_values:
-                src_dst_extra = cp.setdiff1d(
-                    src_dst_indices_old_T, src_dst_indices_old, assume_unique=True
-                )
-                sorter = cp.argsort(src_dst_indices_old_T)
-                idx = cp.searchsorted(
-                    src_dst_indices_old_T, src_dst_extra, sorter=sorter
-                )
-                indices = sorter[idx]
-                src_indices = cp.hstack((self.src_indices, self.dst_indices[indices]))
-                dst_indices = cp.hstack((self.dst_indices, self.src_indices[indices]))
-                edge_values = {
-                    key: cp.hstack((val, val[indices]))
-                    for key, val in self.edge_values.items()
-                }
-                edge_masks = {
-                    key: cp.hstack((val, val[indices]))
-                    for key, val in self.edge_masks.items()
-                }
-            else:
-                src_dst_indices_new = cp.union1d(
-                    src_dst_indices_old, src_dst_indices_old_T
-                )
-                src_indices, dst_indices = cp.divmod(src_dst_indices_new, N)
-                src_indices = src_indices.astype(index_dtype)
-                dst_indices = dst_indices.astype(index_dtype)
-
-        if self.edge_values:
-            recip_indices = cp.lexsort(cp.vstack((src_indices, dst_indices)))
-            for key, mask in edge_masks.items():
-                # Make sure we choose a value that isn't masked out
-                val = edge_values[key]
-                rmask = mask[recip_indices]
-                recip_only = rmask & ~mask
-                val[recip_only] = val[recip_indices[recip_only]]
-                only = mask & ~rmask
-                val[recip_indices[only]] = val[only]
-                mask |= mask[recip_indices]
-            # Arbitrarily choose to use value from (j > i) edge
-            mask = src_indices < dst_indices
-            left_idx = cp.nonzero(mask)[0]
-            right_idx = recip_indices[mask]
-            for val in edge_values.values():
-                val[left_idx] = val[right_idx]
-        else:
-            edge_values = {}
-            edge_masks = {}
-
-        node_values = self.node_values
-        node_masks = self.node_masks
-        key_to_id = self.key_to_id
-        id_to_key = None if key_to_id is None else self._id_to_key
-        if not as_view:
-            node_values = {key: val.copy() for key, val in node_values.items()}
-            node_masks = {key: val.copy() for key, val in node_masks.items()}
-            if key_to_id is not None:
-                key_to_id = key_to_id.copy()
-                if id_to_key is not None:
-                    id_to_key = id_to_key.copy()
-        rv = self.to_undirected_class().from_coo(
-            N,
-            src_indices,
-            dst_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            id_to_key=id_to_key,
-            use_compat_graph=False,
-        )
-        if as_view:
-            rv.graph = self.graph
-        else:
-            rv.graph.update(deepcopy(self.graph))
-        return rv
-
-    # Many more methods to implement...
-
-    ###################
-    # Private methods #
-    ###################
-
-    def _in_degrees_array(self, *, ignore_selfloops=False):
-        dst_indices = self.dst_indices
-        if ignore_selfloops:
-            not_selfloops = self.src_indices != dst_indices
-            dst_indices = dst_indices[not_selfloops]
-        if dst_indices.size == 0:
-            return cp.zeros(self._N, dtype=np.int64)
-        return cp.bincount(dst_indices, minlength=self._N)
-
-    def _out_degrees_array(self, *, ignore_selfloops=False):
-        src_indices = self.src_indices
-        if ignore_selfloops:
-            not_selfloops = src_indices != self.dst_indices
-            src_indices = src_indices[not_selfloops]
-        if src_indices.size == 0:
-            return cp.zeros(self._N, dtype=np.int64)
-        return cp.bincount(src_indices, minlength=self._N)
diff --git a/python/nx-cugraph/nx_cugraph/classes/function.py b/python/nx-cugraph/nx_cugraph/classes/function.py
deleted file mode 100644
index 55cbf19aa7a..00000000000
--- a/python/nx-cugraph/nx_cugraph/classes/function.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-
-from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import networkx_algorithm
-
-__all__ = [
-    "is_negatively_weighted",
-    "number_of_selfloops",
-]
-
-
-@networkx_algorithm(version_added="24.04")
-def is_negatively_weighted(G, edge=None, weight="weight"):
-    G = _to_graph(G, weight)
-    if edge is not None:
-        data = G.get_edge_data(*edge)
-        if data is None:
-            raise nx.NetworkXError(f"Edge {edge!r} does not exist.")
-        return weight in data and data[weight] < 0
-    if weight not in G.edge_values:
-        return False
-    edge_vals = G.edge_values[weight]
-    if weight in G.edge_masks:
-        edge_vals = edge_vals[G.edge_masks[weight]]
-    return bool((edge_vals < 0).any())
-
-
-@networkx_algorithm(version_added="23.12")
-def number_of_selfloops(G):
-    G = _to_graph(G)
-    is_selfloop = G.src_indices == G.dst_indices
-    return int(cp.count_nonzero(is_selfloop))
-
-
-@number_of_selfloops._should_run
-def _(G):
-    return "Fast algorithm; not worth converting."
diff --git a/python/nx-cugraph/nx_cugraph/classes/graph.py b/python/nx-cugraph/nx_cugraph/classes/graph.py
deleted file mode 100644
index cfe1e1c87e9..00000000000
--- a/python/nx-cugraph/nx_cugraph/classes/graph.py
+++ /dev/null
@@ -1,1147 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-import operator as op
-from copy import deepcopy
-from typing import TYPE_CHECKING
-
-import cupy as cp
-import networkx as nx
-import numpy as np
-import pylibcugraph as plc
-from networkx.classes.graph import (
-    _CachedPropertyResetterAdj,
-    _CachedPropertyResetterNode,
-)
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-
-from ..utils import index_dtype
-
-if TYPE_CHECKING:  # pragma: no cover
-    from collections.abc import Iterable, Iterator
-    from typing import ClassVar
-
-    from nx_cugraph.typing import (
-        AttrKey,
-        Dtype,
-        EdgeTuple,
-        EdgeValue,
-        IndexValue,
-        NodeKey,
-        NodeValue,
-        any_ndarray,
-    )
-
-__all__ = ["CudaGraph", "Graph"]
-
-networkx_api = nxcg.utils.decorators.networkx_class(nx.Graph)
-
-# The "everything" cache key is an internal implementation detail of NetworkX
-# that may change between releases.
-if _nxver < (3, 4):
-    _CACHE_KEY = (
-        True,  # Include all edge values
-        True,  # Include all node values
-        True,  # Include `.graph` attributes
-    )
-else:
-    _CACHE_KEY = (
-        True,  # Include all edge values
-        True,  # Include all node values
-        # `.graph` attributes are always included now
-    )
-
-# Use to indicate when a full conversion to GPU failed so we don't try again.
-_CANT_CONVERT_TO_GPU = "_CANT_CONVERT_TO_GPU"
-
-
-# `collections.UserDict` was the preferred way to subclass dict, but now
-# subclassing dict directly is much better supported and should work here.
-# This class should only be necessary if the user clears the cache manually.
-class _GraphCache(dict):
-    """Cache that ensures Graph will reify into a NetworkX graph when cleared."""
-
-    _graph: Graph
-
-    def __init__(self, graph: Graph):
-        self._graph = graph
-
-    def clear(self) -> None:
-        self._graph._reify_networkx()
-        super().clear()
-
-
-class Graph(nx.Graph):
-    # Tell networkx to dispatch calls with this object to nx-cugraph
-    __networkx_backend__: ClassVar[str] = "cugraph"  # nx >=3.2
-    __networkx_plugin__: ClassVar[str] = "cugraph"  # nx <3.2
-
-    # Core attributes of NetowkrX graphs that will be copied and cleared as appropriate.
-    # These attributes comprise the edge and node data model for NetworkX graphs.
-    _nx_attrs = ("_node", "_adj")
-
-    # Allow networkx dispatch machinery to cache conversions.
-    # This means we should clear the cache if we ever mutate the object!
-    __networkx_cache__: _GraphCache | None
-
-    # networkx properties
-    graph: dict
-    # Should we declare type annotations for the rest?
-
-    # Properties that trigger copying to the CPU
-    def _prepare_setter(self):
-        """Be careful when setting private attributes which may be used during init."""
-        if (
-            # If not present, then this must be in init
-            any(attr not in self.__dict__ for attr in self._nx_attrs)
-            # Already on the CPU
-            or not any(self.__dict__[attr] is None for attr in self._nx_attrs)
-        ):
-            return
-        if self._is_on_gpu:
-            # Copy from GPU to CPU
-            self._reify_networkx()
-            return
-        # Default values
-        for attr in self._nx_attrs:
-            if self.__dict__[attr] is None:
-                if attr == "_succ":
-                    self.__dict__[attr] = self.__dict__["_adj"]
-                else:
-                    self.__dict__[attr] = {}
-
-    @property
-    @networkx_api
-    def _node(self):
-        if (node := self.__dict__["_node"]) is None:
-            self._reify_networkx()
-            node = self.__dict__["_node"]
-        return node
-
-    @_node.setter
-    def _node(self, val):
-        self._prepare_setter()
-        _CachedPropertyResetterNode.__set__(None, self, val)
-        if cache := getattr(self, "__networkx_cache__", None):
-            cache.clear()
-
-    @property
-    @networkx_api
-    def _adj(self):
-        if (adj := self.__dict__["_adj"]) is None:
-            self._reify_networkx()
-            adj = self.__dict__["_adj"]
-        return adj
-
-    @_adj.setter
-    def _adj(self, val):
-        self._prepare_setter()
-        _CachedPropertyResetterAdj.__set__(None, self, val)
-        if cache := getattr(self, "__networkx_cache__", None):
-            cache.clear()
-
-    @property
-    def _is_on_gpu(self) -> bool:
-        """Whether the full graph is on device (in the cache).
-
-        This returns False when only a subset of the graph (such as only
-        edge indices and edge attribute) is on device.
-
-        The graph may be on host (CPU) and device (GPU) at the same time.
-        """
-        cache = getattr(self, "__networkx_cache__", None)
-        if not cache:
-            return False
-        return _CACHE_KEY in cache.get("backends", {}).get("cugraph", {})
-
-    @property
-    def _is_on_cpu(self) -> bool:
-        """Whether the graph is on host as a NetworkX graph.
-
-        This means the core data structures that comprise a NetworkX graph
-        (such as ``G._node`` and ``G._adj``) are present.
-
-        The graph may be on host (CPU) and device (GPU) at the same time.
-        """
-        return self.__dict__["_node"] is not None
-
-    @property
-    def _cudagraph(self):
-        """Return the full ``CudaGraph`` on device, computing if necessary, or None."""
-        nx_cache = getattr(self, "__networkx_cache__", None)
-        if nx_cache is None:
-            nx_cache = {}
-        elif _CANT_CONVERT_TO_GPU in nx_cache:
-            return None
-        cache = nx_cache.setdefault("backends", {}).setdefault("cugraph", {})
-        if (Gcg := cache.get(_CACHE_KEY)) is not None:
-            if isinstance(Gcg, Graph):
-                # This shouldn't happen during normal use, but be extra-careful anyway
-                return Gcg._cudagraph
-            return Gcg
-        if self.__dict__["_node"] is None:
-            raise RuntimeError(
-                f"{type(self).__name__} cannot be converted to the GPU, because it is "
-                "not on the CPU! This is not supposed to be possible. If you believe "
-                "you have found a bug, please report a minimum reproducible example to "
-                "https://github.com/rapidsai/cugraph/issues/new/choose"
-            )
-        try:
-            Gcg = nxcg.from_networkx(
-                self, preserve_edge_attrs=True, preserve_node_attrs=True
-            )
-        except Exception:
-            # Should we warn that the full graph can't be on GPU?
-            nx_cache[_CANT_CONVERT_TO_GPU] = True
-            return None
-        Gcg.graph = self.graph
-        cache[_CACHE_KEY] = Gcg
-        return Gcg
-
-    @_cudagraph.setter
-    def _cudagraph(self, val, *, clear_cpu=True):
-        """Set the full ``CudaGraph`` for this graph, or remove from device if None."""
-        if (cache := getattr(self, "__networkx_cache__", None)) is None:
-            # Should we warn?
-            return
-        # TODO: pay close attention to when we should clear the cache, since
-        # this may or may not be a mutation.
-        cache = cache.setdefault("backends", {}).setdefault("cugraph", {})
-        if val is None:
-            cache.pop(_CACHE_KEY, None)
-        else:
-            self.graph = val.graph
-            cache[_CACHE_KEY] = val
-            if clear_cpu:
-                for key in self._nx_attrs:
-                    self.__dict__[key] = None
-
-    @nx.Graph.name.setter
-    def name(self, s):
-        # Don't clear the cache when setting the name, since `.graph` is shared.
-        # There is a very small risk here for the cache to become (slightly)
-        # insconsistent if graphs from other backends are cached.
-        self.graph["name"] = s
-
-    @classmethod
-    @networkx_api
-    def is_directed(cls) -> bool:
-        return False
-
-    @classmethod
-    @networkx_api
-    def is_multigraph(cls) -> bool:
-        return False
-
-    @classmethod
-    def to_cudagraph_class(cls) -> type[CudaGraph]:
-        return CudaGraph
-
-    @classmethod
-    @networkx_api
-    def to_directed_class(cls) -> type[nxcg.DiGraph]:
-        return nxcg.DiGraph
-
-    @classmethod
-    def to_networkx_class(cls) -> type[nx.Graph]:
-        return nx.Graph
-
-    @classmethod
-    @networkx_api
-    def to_undirected_class(cls) -> type[Graph]:
-        return Graph
-
-    def __init__(self, incoming_graph_data=None, **attr):
-        super().__init__(incoming_graph_data, **attr)
-        self.__networkx_cache__ = _GraphCache(self)
-
-    def _reify_networkx(self) -> None:
-        """Copy graph to host (CPU) if necessary."""
-        if self.__dict__["_node"] is None:
-            # After we make this into an nx graph, we rely on the cache being correct
-            Gcg = self._cudagraph
-            G = nxcg.to_networkx(Gcg)
-            for key in self._nx_attrs:
-                self.__dict__[key] = G.__dict__[key]
-
-    def _become(self, other: Graph):
-        if self.__class__ is not other.__class__:
-            raise TypeError(
-                "Attempting to update graph inplace with graph of different type!"
-            )
-        # Begin with the simplest implementation; do we need to do more?
-        self.__dict__.update(other.__dict__)
-        return self
-
-    ####################
-    # Creation methods #
-    ####################
-
-    @classmethod
-    def from_coo(
-        cls,
-        N: int,
-        src_indices: cp.ndarray[IndexValue],
-        dst_indices: cp.ndarray[IndexValue],
-        edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None,
-        edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None,
-        node_values: dict[AttrKey, any_ndarray[NodeValue]] | None = None,
-        node_masks: dict[AttrKey, any_ndarray[bool]] | None = None,
-        *,
-        key_to_id: dict[NodeKey, IndexValue] | None = None,
-        id_to_key: list[NodeKey] | None = None,
-        use_compat_graph: bool | None = None,
-        **attr,
-    ) -> Graph | CudaGraph:
-        new_graph = object.__new__(cls.to_cudagraph_class())
-        new_graph.__networkx_cache__ = {}
-        new_graph.src_indices = src_indices
-        new_graph.dst_indices = dst_indices
-        new_graph.edge_values = {} if edge_values is None else dict(edge_values)
-        new_graph.edge_masks = {} if edge_masks is None else dict(edge_masks)
-        new_graph.node_values = {} if node_values is None else dict(node_values)
-        new_graph.node_masks = {} if node_masks is None else dict(node_masks)
-        new_graph.key_to_id = None if key_to_id is None else dict(key_to_id)
-        new_graph._id_to_key = None if id_to_key is None else list(id_to_key)
-        new_graph._N = op.index(N)  # Ensure N is integral
-        new_graph._node_ids = None
-        new_graph.graph = new_graph.graph_attr_dict_factory()
-        new_graph.graph.update(attr)
-        size = new_graph.src_indices.size
-        # Easy and fast sanity checks
-        if size != new_graph.dst_indices.size:
-            raise ValueError
-        for edge_attr in ["edge_values", "edge_masks"]:
-            if datadict := getattr(new_graph, edge_attr):
-                for key, val in datadict.items():
-                    if val.shape[0] != size:
-                        raise ValueError(key)
-        for node_attr in ["node_values", "node_masks"]:
-            if datadict := getattr(new_graph, node_attr):
-                for key, val in datadict.items():
-                    if val.shape[0] != N:
-                        raise ValueError(key)
-        if new_graph.key_to_id is not None and len(new_graph.key_to_id) != N:
-            raise ValueError
-        if new_graph._id_to_key is not None and len(new_graph._id_to_key) != N:
-            raise ValueError
-        if new_graph._id_to_key is not None and new_graph.key_to_id is None:
-            try:
-                new_graph.key_to_id = dict(zip(new_graph._id_to_key, range(N)))
-            except TypeError as exc:
-                raise ValueError("Bad type of a node value") from exc
-        if new_graph.src_indices.dtype != index_dtype:
-            src_indices = new_graph.src_indices.astype(index_dtype)
-            if not (new_graph.src_indices == src_indices).all():
-                raise ValueError(
-                    f"Unable to convert src_indices to {src_indices.dtype.name} "
-                    f"(got {new_graph.src_indices.dtype.name})."
-                )
-            new_graph.src_indices = src_indices
-        if new_graph.dst_indices.dtype != index_dtype:
-            dst_indices = new_graph.dst_indices.astype(index_dtype)
-            if not (new_graph.dst_indices == dst_indices).all():
-                raise ValueError(
-                    f"Unable to convert dst_indices to {dst_indices.dtype.name} "
-                    f"(got {new_graph.dst_indices.dtype.name})."
-                )
-            new_graph.dst_indices = dst_indices
-
-        # If the graph contains isolates, plc.SGGraph() must be passed a value
-        # for vertices_array that contains every vertex ID, since the
-        # src/dst_indices arrays will not contain IDs for isolates. Create this
-        # only if needed. Like src/dst_indices, the _node_ids array must be
-        # maintained for the lifetime of the plc.SGGraph
-        isolates = nxcg.algorithms.isolate._isolates(new_graph)
-        if len(isolates) > 0:
-            new_graph._node_ids = cp.arange(new_graph._N, dtype=index_dtype)
-        if use_compat_graph or use_compat_graph is None and issubclass(cls, Graph):
-            new_graph = new_graph._to_compat_graph()
-        return new_graph
-
-    @classmethod
-    def from_csr(
-        cls,
-        indptr: cp.ndarray[IndexValue],
-        dst_indices: cp.ndarray[IndexValue],
-        edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None,
-        edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None,
-        node_values: dict[AttrKey, any_ndarray[NodeValue]] | None = None,
-        node_masks: dict[AttrKey, any_ndarray[bool]] | None = None,
-        *,
-        key_to_id: dict[NodeKey, IndexValue] | None = None,
-        id_to_key: list[NodeKey] | None = None,
-        use_compat_graph: bool | None = None,
-        **attr,
-    ) -> Graph | CudaGraph:
-        N = indptr.size - 1
-        src_indices = cp.array(
-            # cp.repeat is slow to use here, so use numpy instead
-            np.repeat(np.arange(N, dtype=index_dtype), cp.diff(indptr).get())
-        )
-        return cls.from_coo(
-            N,
-            src_indices,
-            dst_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            id_to_key=id_to_key,
-            use_compat_graph=use_compat_graph,
-            **attr,
-        )
-
-    @classmethod
-    def from_csc(
-        cls,
-        indptr: cp.ndarray[IndexValue],
-        src_indices: cp.ndarray[IndexValue],
-        edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None,
-        edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None,
-        node_values: dict[AttrKey, any_ndarray[NodeValue]] | None = None,
-        node_masks: dict[AttrKey, any_ndarray[bool]] | None = None,
-        *,
-        key_to_id: dict[NodeKey, IndexValue] | None = None,
-        id_to_key: list[NodeKey] | None = None,
-        use_compat_graph: bool | None = None,
-        **attr,
-    ) -> Graph | CudaGraph:
-        N = indptr.size - 1
-        dst_indices = cp.array(
-            # cp.repeat is slow to use here, so use numpy instead
-            np.repeat(np.arange(N, dtype=index_dtype), cp.diff(indptr).get())
-        )
-        return cls.from_coo(
-            N,
-            src_indices,
-            dst_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            id_to_key=id_to_key,
-            use_compat_graph=use_compat_graph,
-            **attr,
-        )
-
-    @classmethod
-    def from_dcsr(
-        cls,
-        N: int,
-        compressed_srcs: cp.ndarray[IndexValue],
-        indptr: cp.ndarray[IndexValue],
-        dst_indices: cp.ndarray[IndexValue],
-        edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None,
-        edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None,
-        node_values: dict[AttrKey, any_ndarray[NodeValue]] | None = None,
-        node_masks: dict[AttrKey, any_ndarray[bool]] | None = None,
-        *,
-        key_to_id: dict[NodeKey, IndexValue] | None = None,
-        id_to_key: list[NodeKey] | None = None,
-        use_compat_graph: bool | None = None,
-        **attr,
-    ) -> Graph | CudaGraph:
-        src_indices = cp.array(
-            # cp.repeat is slow to use here, so use numpy instead
-            np.repeat(compressed_srcs.get(), cp.diff(indptr).get())
-        )
-        return cls.from_coo(
-            N,
-            src_indices,
-            dst_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            id_to_key=id_to_key,
-            use_compat_graph=use_compat_graph,
-            **attr,
-        )
-
-    @classmethod
-    def from_dcsc(
-        cls,
-        N: int,
-        compressed_dsts: cp.ndarray[IndexValue],
-        indptr: cp.ndarray[IndexValue],
-        src_indices: cp.ndarray[IndexValue],
-        edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None,
-        edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None,
-        node_values: dict[AttrKey, any_ndarray[NodeValue]] | None = None,
-        node_masks: dict[AttrKey, any_ndarray[bool]] | None = None,
-        *,
-        key_to_id: dict[NodeKey, IndexValue] | None = None,
-        id_to_key: list[NodeKey] | None = None,
-        use_compat_graph: bool | None = None,
-        **attr,
-    ) -> Graph | CudaGraph:
-        dst_indices = cp.array(
-            # cp.repeat is slow to use here, so use numpy instead
-            np.repeat(compressed_dsts.get(), cp.diff(indptr).get())
-        )
-        return cls.from_coo(
-            N,
-            src_indices,
-            dst_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            id_to_key=id_to_key,
-            use_compat_graph=use_compat_graph,
-            **attr,
-        )
-
-
-class CudaGraph:
-    # Tell networkx to dispatch calls with this object to nx-cugraph
-    __networkx_backend__: ClassVar[str] = "cugraph"  # nx >=3.2
-    __networkx_plugin__: ClassVar[str] = "cugraph"  # nx <3.2
-
-    # Allow networkx dispatch machinery to cache conversions.
-    # This means we should clear the cache if we ever mutate the object!
-    __networkx_cache__: dict | None
-
-    # networkx properties
-    graph: dict
-    graph_attr_dict_factory: ClassVar[type] = dict
-
-    # Not networkx properties
-    # We store edge data in COO format with {src,dst}_indices and edge_values.
-    src_indices: cp.ndarray[IndexValue]
-    dst_indices: cp.ndarray[IndexValue]
-    edge_values: dict[AttrKey, cp.ndarray[EdgeValue]]
-    edge_masks: dict[AttrKey, cp.ndarray[bool]]
-    node_values: dict[AttrKey, any_ndarray[NodeValue]]
-    node_masks: dict[AttrKey, any_ndarray[bool]]
-    key_to_id: dict[NodeKey, IndexValue] | None
-    _id_to_key: list[NodeKey] | None
-    _N: int
-    _node_ids: cp.ndarray[IndexValue] | None  # holds plc.SGGraph.vertices_array data
-
-    # Used by graph._get_plc_graph
-    _plc_type_map: ClassVar[dict[np.dtype, np.dtype]] = {
-        # signed int
-        np.dtype(np.int8): np.dtype(np.float32),
-        np.dtype(np.int16): np.dtype(np.float32),
-        np.dtype(np.int32): np.dtype(np.float64),
-        np.dtype(np.int64): np.dtype(np.float64),  # raise if abs(x) > 2**53
-        # unsigned int
-        np.dtype(np.uint8): np.dtype(np.float32),
-        np.dtype(np.uint16): np.dtype(np.float32),
-        np.dtype(np.uint32): np.dtype(np.float64),
-        np.dtype(np.uint64): np.dtype(np.float64),  # raise if x > 2**53
-        # other
-        np.dtype(np.bool_): np.dtype(np.float32),
-        np.dtype(np.float16): np.dtype(np.float32),
-    }
-    _plc_allowed_edge_types: ClassVar[set[np.dtype]] = {
-        np.dtype(np.float32),
-        np.dtype(np.float64),
-    }
-
-    ####################
-    # Creation methods #
-    ####################
-
-    from_coo = classmethod(Graph.from_coo.__func__)
-    from_csr = classmethod(Graph.from_csr.__func__)
-    from_csc = classmethod(Graph.from_csc.__func__)
-    from_dcsr = classmethod(Graph.from_dcsr.__func__)
-    from_dcsc = classmethod(Graph.from_dcsc.__func__)
-
-    def __new__(cls, incoming_graph_data=None, **attr) -> CudaGraph:
-        if incoming_graph_data is None:
-            new_graph = cls.from_coo(
-                0,
-                cp.empty(0, index_dtype),
-                cp.empty(0, index_dtype),
-                use_compat_graph=False,
-            )
-        elif incoming_graph_data.__class__ is cls:
-            new_graph = incoming_graph_data.copy()
-        elif incoming_graph_data.__class__ is cls.to_networkx_class():
-            new_graph = nxcg.from_networkx(incoming_graph_data, preserve_all_attrs=True)
-        else:
-            raise NotImplementedError
-        new_graph.graph.update(attr)
-        # We could return Graph here (if configured), but let's not for now
-        return new_graph
-
-    #################
-    # Class methods #
-    #################
-
-    is_directed = classmethod(Graph.is_directed.__func__)
-    is_multigraph = classmethod(Graph.is_multigraph.__func__)
-    to_cudagraph_class = classmethod(Graph.to_cudagraph_class.__func__)
-    to_networkx_class = classmethod(Graph.to_networkx_class.__func__)
-
-    @classmethod
-    @networkx_api
-    def to_directed_class(cls) -> type[nxcg.CudaDiGraph]:
-        return nxcg.CudaDiGraph
-
-    @classmethod
-    @networkx_api
-    def to_undirected_class(cls) -> type[CudaGraph]:
-        return CudaGraph
-
-    @classmethod
-    def _to_compat_graph_class(cls) -> type[Graph]:
-        return Graph
-
-    ##############
-    # Properties #
-    ##############
-
-    @property
-    def edge_dtypes(self) -> dict[AttrKey, Dtype]:
-        return {key: val.dtype for key, val in self.edge_values.items()}
-
-    @property
-    def node_dtypes(self) -> dict[AttrKey, Dtype]:
-        return {key: val.dtype for key, val in self.node_values.items()}
-
-    @property
-    def id_to_key(self) -> [NodeKey] | None:
-        if self.key_to_id is None:
-            return None
-        if self._id_to_key is None:
-            self._id_to_key = sorted(self.key_to_id, key=self.key_to_id.__getitem__)
-        return self._id_to_key
-
-    name = nx.Graph.name
-
-    ##################
-    # Dunder methods #
-    ##################
-
-    @networkx_api
-    def __contains__(self, n: NodeKey) -> bool:
-        if self.key_to_id is not None:
-            container = self.key_to_id
-        else:
-            container = range(self._N)
-        try:
-            return n in container
-        except TypeError:
-            return False
-
-    @networkx_api
-    def __iter__(self) -> Iterator[NodeKey]:
-        if self.key_to_id is not None:
-            return iter(self.key_to_id)
-        return iter(range(self._N))
-
-    @networkx_api
-    def __len__(self) -> int:
-        return self._N
-
-    __str__ = nx.Graph.__str__
-
-    ##########################
-    # NetworkX graph methods #
-    ##########################
-
-    @networkx_api
-    def add_nodes_from(self, nodes_for_adding: Iterable[NodeKey], **attr) -> None:
-        if self._N != 0:
-            raise NotImplementedError(
-                "add_nodes_from is not implemented for graph that already has nodes."
-            )
-        G = self.to_networkx_class()()
-        G.add_nodes_from(nodes_for_adding, **attr)
-        G = nxcg.from_networkx(G, preserve_node_attrs=True)
-        self._become(G)
-
-    @networkx_api
-    def clear(self) -> None:
-        self.edge_values.clear()
-        self.edge_masks.clear()
-        self.node_values.clear()
-        self.node_masks.clear()
-        self.graph.clear()
-        self.src_indices = cp.empty(0, self.src_indices.dtype)
-        self.dst_indices = cp.empty(0, self.dst_indices.dtype)
-        self._N = 0
-        self._node_ids = None
-        self.key_to_id = None
-        self._id_to_key = None
-        if cache := self.__networkx_cache__:
-            cache.clear()
-
-    @networkx_api
-    def clear_edges(self) -> None:
-        self.edge_values.clear()
-        self.edge_masks.clear()
-        self.src_indices = cp.empty(0, self.src_indices.dtype)
-        self.dst_indices = cp.empty(0, self.dst_indices.dtype)
-        if cache := self.__networkx_cache__:
-            cache.clear()
-
-    @networkx_api
-    def copy(self, as_view: bool = False) -> CudaGraph:
-        # Does shallow copy in networkx
-        return self._copy(as_view, self.__class__)
-
-    @networkx_api
-    def get_edge_data(
-        self, u: NodeKey, v: NodeKey, default: EdgeValue | None = None
-    ) -> dict[AttrKey, EdgeValue]:
-        if self.key_to_id is not None:
-            try:
-                u = self.key_to_id[u]
-                v = self.key_to_id[v]
-            except KeyError:
-                return default
-        else:
-            try:
-                if u < 0 or v < 0 or u >= self._N or v >= self._N:
-                    return default
-            except TypeError:
-                return default
-        index = cp.nonzero((self.src_indices == u) & (self.dst_indices == v))[0]
-        if index.size == 0:
-            return default
-        [index] = index.tolist()
-        if not self.edge_values:
-            return {}
-        return {
-            key: val[index].tolist()
-            for key, val in self.edge_values.items()
-            if key not in self.edge_masks or self.edge_masks[key][index]
-        }
-
-    @networkx_api
-    def has_edge(self, u: NodeKey, v: NodeKey) -> bool:
-        if self.key_to_id is not None:
-            try:
-                u = self.key_to_id[u]
-                v = self.key_to_id[v]
-            except KeyError:
-                return False
-        return bool(((self.src_indices == u) & (self.dst_indices == v)).any())
-
-    def _neighbors(self, n: NodeKey) -> cp.ndarray[NodeValue]:
-        if n not in self:
-            hash(n)  # To raise TypeError if appropriate
-            raise nx.NetworkXError(
-                f"The node {n} is not in the {self.__class__.__name__.lower()}."
-            )
-        if self.key_to_id is not None:
-            n = self.key_to_id[n]
-        nbrs = self.dst_indices[self.src_indices == n]
-        if self.is_multigraph():
-            nbrs = cp.unique(nbrs)
-        return nbrs
-
-    @networkx_api
-    def neighbors(self, n: NodeKey) -> Iterator[NodeKey]:
-        nbrs = self._neighbors(n)
-        return iter(self._nodeiter_to_iter(nbrs.tolist()))
-
-    @networkx_api
-    def has_node(self, n: NodeKey) -> bool:
-        return n in self
-
-    @networkx_api
-    def nbunch_iter(self, nbunch=None) -> Iterator[NodeKey]:
-        if nbunch is None:
-            return iter(self)
-        if nbunch in self:
-            return iter([nbunch])
-        return (node for node in nbunch if node in self)
-
-    @networkx_api
-    def number_of_edges(
-        self, u: NodeKey | None = None, v: NodeKey | None = None
-    ) -> int:
-        if u is not None or v is not None:
-            raise NotImplementedError
-        return self.size()
-
-    @networkx_api
-    def number_of_nodes(self) -> int:
-        return self._N
-
-    @networkx_api
-    def order(self) -> int:
-        return self._N
-
-    @networkx_api
-    def size(self, weight: AttrKey | None = None) -> int:
-        if weight is not None:
-            raise NotImplementedError
-        # If no self-edges, then `self.src_indices.size // 2`
-        return int(cp.count_nonzero(self.src_indices <= self.dst_indices))
-
-    @networkx_api
-    def to_directed(self, as_view: bool = False) -> nxcg.CudaDiGraph:
-        return self._copy(as_view, self.to_directed_class())
-
-    @networkx_api
-    def to_undirected(self, as_view: bool = False) -> CudaGraph:
-        # Does deep copy in networkx
-        return self._copy(as_view, self.to_undirected_class())
-
-    def _to_compat_graph(self) -> Graph:
-        rv = self._to_compat_graph_class()()
-        rv._cudagraph = self
-        return rv
-
-    # Not implemented...
-    # adj, adjacency, add_edge, add_edges_from, add_node,
-    # add_nodes_from, add_weighted_edges_from, degree,
-    # edge_subgraph, edges, neighbors, nodes, remove_edge,
-    # remove_edges_from, remove_node, remove_nodes_from, subgraph, update
-
-    ###################
-    # Private methods #
-    ###################
-
-    def _copy(self, as_view: bool, cls: type[CudaGraph], reverse: bool = False):
-        # DRY warning: see also CudaMultiGraph._copy
-        src_indices = self.src_indices
-        dst_indices = self.dst_indices
-        edge_values = self.edge_values
-        edge_masks = self.edge_masks
-        node_values = self.node_values
-        node_masks = self.node_masks
-        key_to_id = self.key_to_id
-        id_to_key = None if key_to_id is None else self._id_to_key
-        if self.__networkx_cache__ is None:
-            __networkx_cache__ = None
-        elif not reverse and cls is self.__class__:
-            __networkx_cache__ = self.__networkx_cache__
-        else:
-            __networkx_cache__ = {}
-        if not as_view:
-            src_indices = src_indices.copy()
-            dst_indices = dst_indices.copy()
-            edge_values = {key: val.copy() for key, val in edge_values.items()}
-            edge_masks = {key: val.copy() for key, val in edge_masks.items()}
-            node_values = {key: val.copy() for key, val in node_values.items()}
-            node_masks = {key: val.copy() for key, val in node_masks.items()}
-            if key_to_id is not None:
-                key_to_id = key_to_id.copy()
-                if id_to_key is not None:
-                    id_to_key = id_to_key.copy()
-            if __networkx_cache__ is not None:
-                __networkx_cache__ = __networkx_cache__.copy()
-        if reverse:
-            src_indices, dst_indices = dst_indices, src_indices
-        rv = cls.from_coo(
-            self._N,
-            src_indices,
-            dst_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            id_to_key=id_to_key,
-            use_compat_graph=False,
-        )
-        if as_view:
-            rv.graph = self.graph
-        else:
-            rv.graph.update(deepcopy(self.graph))
-        rv.__networkx_cache__ = __networkx_cache__
-        return rv
-
-    def _get_plc_graph(
-        self,
-        edge_attr: AttrKey | None = None,
-        edge_default: EdgeValue | None = None,
-        edge_dtype: Dtype | None = None,
-        *,
-        store_transposed: bool = False,
-        switch_indices: bool = False,
-        edge_array: cp.ndarray[EdgeValue] | None = None,
-        symmetrize: str | None = None,
-    ):
-        if edge_array is not None or edge_attr is None:
-            pass
-        elif edge_attr not in self.edge_values:
-            if edge_default is None:
-                raise KeyError("Graph has no edge attribute {edge_attr!r}")
-            # If we were given a default edge value, then it's probably okay to
-            # use None for the edge_array if we don't have this edge attribute.
-        elif edge_attr not in self.edge_masks:
-            edge_array = self.edge_values[edge_attr]
-        elif not self.edge_masks[edge_attr].all():
-            if edge_default is None:
-                raise NotImplementedError(
-                    "Missing edge attributes is not yet implemented"
-                )
-            edge_array = cp.where(
-                self.edge_masks[edge_attr], self.edge_values[edge_attr], edge_default
-            )
-        else:
-            # Mask is all True; don't need anymore
-            del self.edge_masks[edge_attr]
-            edge_array = self.edge_values[edge_attr]
-        if edge_array is not None:
-            if edge_dtype is not None:
-                edge_dtype = np.dtype(edge_dtype)
-                if edge_array.dtype != edge_dtype:
-                    edge_array = edge_array.astype(edge_dtype)
-            # PLC doesn't handle int edge weights right now, so cast int to float
-            if edge_array.dtype in self._plc_type_map:
-                if edge_array.dtype == np.int64:
-                    if (val := edge_array.max().tolist()) > 2**53:
-                        raise ValueError(
-                            f"Integer value of value is too large (> 2**53): {val}; "
-                            "pylibcugraph only supports float16 and float32 dtypes."
-                        )
-                    if (val := edge_array.min().tolist()) < -(2**53):
-                        raise ValueError(
-                            f"Integer value of value is small large (< -2**53): {val}; "
-                            "pylibcugraph only supports float16 and float32 dtypes."
-                        )
-                elif (
-                    edge_array.dtype == np.uint64 and edge_array.max().tolist() > 2**53
-                ):
-                    raise ValueError(
-                        f"Integer value of value is too large (> 2**53): {val}; "
-                        "pylibcugraph only supports float16 and float32 dtypes."
-                    )
-                # Consider warning here if we add algorithms that may
-                # introduce roundoff errors when using floats as ints.
-                edge_array = edge_array.astype(self._plc_type_map[edge_array.dtype])
-            elif edge_array.dtype not in self._plc_allowed_edge_types:
-                raise TypeError(edge_array.dtype)
-        # Should we cache PLC graph?
-        src_indices = self.src_indices
-        dst_indices = self.dst_indices
-        if switch_indices:
-            src_indices, dst_indices = dst_indices, src_indices
-        if symmetrize is not None:
-            if edge_array is not None:
-                raise NotImplementedError(
-                    "edge_array must be None when symmetrizing the graph"
-                )
-            N = self._N
-            # Upcast to int64 so indices don't overflow
-            src_dst = N * src_indices.astype(np.int64) + dst_indices
-            src_dst_T = src_indices + N * dst_indices.astype(np.int64)
-            if symmetrize == "union":
-                src_dst_new = cp.union1d(src_dst, src_dst_T)
-            elif symmetrize == "intersection":
-                src_dst_new = cp.intersect1d(src_dst, src_dst_T)
-            else:
-                raise ValueError(
-                    f'symmetrize must be "union" or "intersection"; got "{symmetrize}"'
-                )
-            src_indices, dst_indices = cp.divmod(src_dst_new, N)
-            src_indices = src_indices.astype(index_dtype)
-            dst_indices = dst_indices.astype(index_dtype)
-
-        # This sets drop_multi_edges=True for non-multigraph input, which means
-        # the data in self.src_indices and self.dst_indices may not be
-        # identical to that contained in the returned pcl.SGGraph (the returned
-        # SGGraph may have fewer edges since duplicates are dropped). Ideally
-        # self.src_indices and self.dst_indices would be updated to have
-        # duplicate edges removed for non-multigraph instances, but that
-        # requires additional code which would be redundant and likely not as
-        # performant as the code in PLC.
-        return plc.SGGraph(
-            resource_handle=plc.ResourceHandle(),
-            graph_properties=plc.GraphProperties(
-                is_multigraph=self.is_multigraph() and symmetrize is None,
-                is_symmetric=not self.is_directed() or symmetrize is not None,
-            ),
-            src_or_offset_array=src_indices,
-            dst_or_index_array=dst_indices,
-            weight_array=edge_array,
-            store_transposed=store_transposed,
-            renumber=False,
-            do_expensive_check=False,
-            vertices_array=self._node_ids,
-            drop_multi_edges=not self.is_multigraph(),
-        )
-
-    def _sort_edge_indices(self, primary="src"):
-        # DRY warning: see also CudaMultiGraph._sort_edge_indices
-        if primary == "src":
-            stacked = cp.vstack((self.dst_indices, self.src_indices))
-        elif primary == "dst":
-            stacked = cp.vstack((self.src_indices, self.dst_indices))
-        else:
-            raise ValueError(
-                f'Bad `primary` argument; expected "src" or "dst", got {primary!r}'
-            )
-        indices = cp.lexsort(stacked)
-        if (cp.diff(indices) > 0).all():
-            # Already sorted
-            return
-        self.src_indices = self.src_indices[indices]
-        self.dst_indices = self.dst_indices[indices]
-        self.edge_values.update(
-            {key: val[indices] for key, val in self.edge_values.items()}
-        )
-        self.edge_masks.update(
-            {key: val[indices] for key, val in self.edge_masks.items()}
-        )
-
-    def _become(self, other: CudaGraph):
-        if self.__class__ is not other.__class__:
-            raise TypeError(
-                "Attempting to update graph inplace with graph of different type!"
-            )
-        self.clear()
-        edge_values = self.edge_values
-        edge_masks = self.edge_masks
-        node_values = self.node_values
-        node_masks = self.node_masks
-        __networkx_cache__ = self.__networkx_cache__
-        graph = self.graph
-        edge_values.update(other.edge_values)
-        edge_masks.update(other.edge_masks)
-        node_values.update(other.node_values)
-        node_masks.update(other.node_masks)
-        graph.update(other.graph)
-        if other.__networkx_cache__ is None:
-            __networkx_cache__ = None
-        else:
-            if __networkx_cache__ is None:
-                __networkx_cache__ = {}
-            __networkx_cache__.update(other.__networkx_cache__)
-        self.__dict__.update(other.__dict__)
-        self.edge_values = edge_values
-        self.edge_masks = edge_masks
-        self.node_values = node_values
-        self.node_masks = node_masks
-        self.graph = graph
-        self.__networkx_cache__ = __networkx_cache__
-        return self
-
-    def _degrees_array(self, *, ignore_selfloops=False):
-        src_indices = self.src_indices
-        dst_indices = self.dst_indices
-        if ignore_selfloops:
-            not_selfloops = src_indices != dst_indices
-            src_indices = src_indices[not_selfloops]
-            if self.is_directed():
-                dst_indices = dst_indices[not_selfloops]
-        if src_indices.size == 0:
-            return cp.zeros(self._N, dtype=np.int64)
-        degrees = cp.bincount(src_indices, minlength=self._N)
-        if self.is_directed():
-            degrees += cp.bincount(dst_indices, minlength=self._N)
-        return degrees
-
-    _in_degrees_array = _degrees_array
-    _out_degrees_array = _degrees_array
-
-    # Data conversions
-    def _nodekeys_to_nodearray(self, nodes: Iterable[NodeKey]) -> cp.array[IndexValue]:
-        if self.key_to_id is None:
-            return cp.fromiter(nodes, dtype=index_dtype)
-        return cp.fromiter(map(self.key_to_id.__getitem__, nodes), dtype=index_dtype)
-
-    def _nodeiter_to_iter(self, node_ids: Iterable[IndexValue]) -> Iterable[NodeKey]:
-        """Convert an iterable of node IDs to an iterable of node keys."""
-        if (id_to_key := self.id_to_key) is not None:
-            return map(id_to_key.__getitem__, node_ids)
-        return node_ids
-
-    def _nodearray_to_list(self, node_ids: cp.ndarray[IndexValue]) -> list[NodeKey]:
-        if self.key_to_id is None:
-            return node_ids.tolist()
-        return list(self._nodeiter_to_iter(node_ids.tolist()))
-
-    def _list_to_nodearray(self, nodes: list[NodeKey]) -> cp.ndarray[IndexValue]:
-        if (key_to_id := self.key_to_id) is not None:
-            nodes = [key_to_id[node] for node in nodes]
-        return cp.array(nodes, dtype=index_dtype)
-
-    def _nodearray_to_set(self, node_ids: cp.ndarray[IndexValue]) -> set[NodeKey]:
-        if self.key_to_id is None:
-            return set(node_ids.tolist())
-        return set(self._nodeiter_to_iter(node_ids.tolist()))
-
-    def _nodearray_to_dict(
-        self, values: cp.ndarray[NodeValue]
-    ) -> dict[NodeKey, NodeValue]:
-        it = enumerate(values.tolist())
-        if (id_to_key := self.id_to_key) is not None:
-            return {id_to_key[key]: val for key, val in it}
-        return dict(it)
-
-    def _nodearrays_to_dict(
-        self, node_ids: cp.ndarray[IndexValue], values: any_ndarray[NodeValue]
-    ) -> dict[NodeKey, NodeValue]:
-        it = zip(node_ids.tolist(), values.tolist())
-        if (id_to_key := self.id_to_key) is not None:
-            return {id_to_key[key]: val for key, val in it}
-        return dict(it)
-
-    def _edgearrays_to_dict(
-        self,
-        src_ids: cp.ndarray[IndexValue],
-        dst_ids: cp.ndarray[IndexValue],
-        values: cp.ndarray[EdgeValue],
-    ) -> dict[EdgeTuple, EdgeValue]:
-        it = zip(zip(src_ids.tolist(), dst_ids.tolist()), values.tolist())
-        if (id_to_key := self.id_to_key) is not None:
-            return {
-                (id_to_key[src_id], id_to_key[dst_id]): val
-                for (src_id, dst_id), val in it
-            }
-        return dict(it)
-
-    def _dict_to_nodearrays(
-        self,
-        d: dict[NodeKey, NodeValue],
-        dtype: Dtype | None = None,
-    ) -> tuple[cp.ndarray[IndexValue], cp.ndarray[NodeValue]]:
-        if self.key_to_id is None:
-            indices_iter = d
-        else:
-            indices_iter = map(self.key_to_id.__getitem__, d)
-        node_ids = cp.fromiter(indices_iter, index_dtype)
-        if dtype is None:
-            values = cp.array(list(d.values()))
-        else:
-            values = cp.fromiter(d.values(), dtype)
-        return node_ids, values
-
-    def _dict_to_nodearray(
-        self,
-        d: dict[NodeKey, NodeValue] | cp.ndarray[NodeValue],
-        default: NodeValue | None = None,
-        dtype: Dtype | None = None,
-    ) -> cp.ndarray[NodeValue]:
-        if isinstance(d, cp.ndarray):
-            if d.shape[0] != len(self):
-                raise ValueError
-            if dtype is not None and d.dtype != dtype:
-                return d.astype(dtype)
-            return d
-        if default is None:
-            val_iter = map(d.__getitem__, self)
-        else:
-            val_iter = (d.get(node, default) for node in self)
-        if dtype is None:
-            return cp.array(list(val_iter))
-        return cp.fromiter(val_iter, dtype)
diff --git a/python/nx-cugraph/nx_cugraph/classes/multidigraph.py b/python/nx-cugraph/nx_cugraph/classes/multidigraph.py
deleted file mode 100644
index 5a6595567d2..00000000000
--- a/python/nx-cugraph/nx_cugraph/classes/multidigraph.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-import networkx as nx
-
-import nx_cugraph as nxcg
-
-from .digraph import CudaDiGraph, DiGraph
-from .graph import Graph
-from .multigraph import CudaMultiGraph, MultiGraph
-
-__all__ = ["CudaMultiDiGraph", "MultiDiGraph"]
-
-networkx_api = nxcg.utils.decorators.networkx_class(nx.MultiDiGraph)
-
-
-class MultiDiGraph(nx.MultiDiGraph, MultiGraph, DiGraph):
-    name = Graph.name
-    _node = Graph._node
-    _adj = DiGraph._adj
-    _succ = DiGraph._succ
-    _pred = DiGraph._pred
-
-    @classmethod
-    @networkx_api
-    def is_directed(cls) -> bool:
-        return True
-
-    @classmethod
-    @networkx_api
-    def is_multigraph(cls) -> bool:
-        return True
-
-    @classmethod
-    def to_cudagraph_class(cls) -> type[CudaMultiDiGraph]:
-        return CudaMultiDiGraph
-
-    @classmethod
-    def to_networkx_class(cls) -> type[nx.MultiDiGraph]:
-        return nx.MultiDiGraph
-
-
-class CudaMultiDiGraph(CudaMultiGraph, CudaDiGraph):
-    is_directed = classmethod(MultiDiGraph.is_directed.__func__)
-    is_multigraph = classmethod(MultiDiGraph.is_multigraph.__func__)
-    to_cudagraph_class = classmethod(MultiDiGraph.to_cudagraph_class.__func__)
-    to_networkx_class = classmethod(MultiDiGraph.to_networkx_class.__func__)
-
-    @classmethod
-    def _to_compat_graph_class(cls) -> type[MultiDiGraph]:
-        return MultiDiGraph
-
-    ##########################
-    # NetworkX graph methods #
-    ##########################
-
-    @networkx_api
-    def to_undirected(self, reciprocal=False, as_view=False):
-        raise NotImplementedError
diff --git a/python/nx-cugraph/nx_cugraph/classes/multigraph.py b/python/nx-cugraph/nx_cugraph/classes/multigraph.py
deleted file mode 100644
index c8c8f1dfb00..00000000000
--- a/python/nx-cugraph/nx_cugraph/classes/multigraph.py
+++ /dev/null
@@ -1,552 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-from copy import deepcopy
-from typing import TYPE_CHECKING, ClassVar
-
-import cupy as cp
-import networkx as nx
-import numpy as np
-
-import nx_cugraph as nxcg
-
-from ..utils import index_dtype
-from .graph import CudaGraph, Graph, _GraphCache
-
-if TYPE_CHECKING:
-    from nx_cugraph.typing import (
-        AttrKey,
-        EdgeKey,
-        EdgeValue,
-        IndexValue,
-        NodeKey,
-        NodeValue,
-        any_ndarray,
-    )
-__all__ = ["MultiGraph", "CudaMultiGraph"]
-
-networkx_api = nxcg.utils.decorators.networkx_class(nx.MultiGraph)
-
-
-class MultiGraph(nx.MultiGraph, Graph):
-    name = Graph.name
-    _node = Graph._node
-    _adj = Graph._adj
-
-    @classmethod
-    @networkx_api
-    def is_directed(cls) -> bool:
-        return False
-
-    @classmethod
-    @networkx_api
-    def is_multigraph(cls) -> bool:
-        return True
-
-    @classmethod
-    def to_cudagraph_class(cls) -> type[CudaMultiGraph]:
-        return CudaMultiGraph
-
-    @classmethod
-    @networkx_api
-    def to_directed_class(cls) -> type[nxcg.MultiDiGraph]:
-        return nxcg.MultiDiGraph
-
-    @classmethod
-    def to_networkx_class(cls) -> type[nx.MultiGraph]:
-        return nx.MultiGraph
-
-    @classmethod
-    @networkx_api
-    def to_undirected_class(cls) -> type[MultiGraph]:
-        return MultiGraph
-
-    def __init__(self, incoming_graph_data=None, multigraph_input=None, **attr):
-        super().__init__(incoming_graph_data, multigraph_input, **attr)
-        self.__networkx_cache__ = _GraphCache(self)
-
-    ####################
-    # Creation methods #
-    ####################
-
-    @classmethod
-    def from_coo(
-        cls,
-        N: int,
-        src_indices: cp.ndarray[IndexValue],
-        dst_indices: cp.ndarray[IndexValue],
-        edge_indices: cp.ndarray[IndexValue] | None = None,
-        edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None,
-        edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None,
-        node_values: dict[AttrKey, any_ndarray[NodeValue]] | None = None,
-        node_masks: dict[AttrKey, any_ndarray[bool]] | None = None,
-        *,
-        key_to_id: dict[NodeKey, IndexValue] | None = None,
-        id_to_key: list[NodeKey] | None = None,
-        edge_keys: list[EdgeKey] | None = None,
-        use_compat_graph: bool | None = None,
-        **attr,
-    ) -> MultiGraph | CudaMultiGraph:
-        new_graph = super(cls.to_undirected_class(), cls).from_coo(
-            N,
-            src_indices,
-            dst_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            id_to_key=id_to_key,
-            use_compat_graph=False,
-            **attr,
-        )
-        new_graph.edge_indices = edge_indices
-        new_graph.edge_keys = edge_keys
-        # Easy and fast sanity checks
-        if (
-            new_graph.edge_keys is not None
-            and len(new_graph.edge_keys) != src_indices.size
-        ):
-            raise ValueError
-        if use_compat_graph or use_compat_graph is None and issubclass(cls, Graph):
-            new_graph = new_graph._to_compat_graph()
-        return new_graph
-
-    @classmethod
-    def from_csr(
-        cls,
-        indptr: cp.ndarray[IndexValue],
-        dst_indices: cp.ndarray[IndexValue],
-        edge_indices: cp.ndarray[IndexValue] | None = None,
-        edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None,
-        edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None,
-        node_values: dict[AttrKey, any_ndarray[NodeValue]] | None = None,
-        node_masks: dict[AttrKey, any_ndarray[bool]] | None = None,
-        *,
-        key_to_id: dict[NodeKey, IndexValue] | None = None,
-        id_to_key: list[NodeKey] | None = None,
-        edge_keys: list[EdgeKey] | None = None,
-        use_compat_graph: bool | None = None,
-        **attr,
-    ) -> MultiGraph | CudaMultiGraph:
-        N = indptr.size - 1
-        src_indices = cp.array(
-            # cp.repeat is slow to use here, so use numpy instead
-            np.repeat(np.arange(N, dtype=index_dtype), cp.diff(indptr).get())
-        )
-        return cls.from_coo(
-            N,
-            src_indices,
-            dst_indices,
-            edge_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            id_to_key=id_to_key,
-            edge_keys=edge_keys,
-            use_compat_graph=use_compat_graph,
-            **attr,
-        )
-
-    @classmethod
-    def from_csc(
-        cls,
-        indptr: cp.ndarray[IndexValue],
-        src_indices: cp.ndarray[IndexValue],
-        edge_indices: cp.ndarray[IndexValue] | None = None,
-        edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None,
-        edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None,
-        node_values: dict[AttrKey, any_ndarray[NodeValue]] | None = None,
-        node_masks: dict[AttrKey, any_ndarray[bool]] | None = None,
-        *,
-        key_to_id: dict[NodeKey, IndexValue] | None = None,
-        id_to_key: list[NodeKey] | None = None,
-        edge_keys: list[EdgeKey] | None = None,
-        use_compat_graph: bool | None = None,
-        **attr,
-    ) -> MultiGraph | CudaMultiGraph:
-        N = indptr.size - 1
-        dst_indices = cp.array(
-            # cp.repeat is slow to use here, so use numpy instead
-            np.repeat(np.arange(N, dtype=index_dtype), cp.diff(indptr).get())
-        )
-        return cls.from_coo(
-            N,
-            src_indices,
-            dst_indices,
-            edge_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            id_to_key=id_to_key,
-            edge_keys=edge_keys,
-            use_compat_graph=use_compat_graph,
-            **attr,
-        )
-
-    @classmethod
-    def from_dcsr(
-        cls,
-        N: int,
-        compressed_srcs: cp.ndarray[IndexValue],
-        indptr: cp.ndarray[IndexValue],
-        dst_indices: cp.ndarray[IndexValue],
-        edge_indices: cp.ndarray[IndexValue] | None = None,
-        edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None,
-        edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None,
-        node_values: dict[AttrKey, any_ndarray[NodeValue]] | None = None,
-        node_masks: dict[AttrKey, any_ndarray[bool]] | None = None,
-        *,
-        key_to_id: dict[NodeKey, IndexValue] | None = None,
-        id_to_key: list[NodeKey] | None = None,
-        edge_keys: list[EdgeKey] | None = None,
-        use_compat_graph: bool | None = None,
-        **attr,
-    ) -> MultiGraph | CudaMultiGraph:
-        src_indices = cp.array(
-            # cp.repeat is slow to use here, so use numpy instead
-            np.repeat(compressed_srcs.get(), cp.diff(indptr).get())
-        )
-        return cls.from_coo(
-            N,
-            src_indices,
-            dst_indices,
-            edge_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            id_to_key=id_to_key,
-            edge_keys=edge_keys,
-            use_compat_graph=use_compat_graph,
-            **attr,
-        )
-
-    @classmethod
-    def from_dcsc(
-        cls,
-        N: int,
-        compressed_dsts: cp.ndarray[IndexValue],
-        indptr: cp.ndarray[IndexValue],
-        src_indices: cp.ndarray[IndexValue],
-        edge_indices: cp.ndarray[IndexValue] | None = None,
-        edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None,
-        edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None,
-        node_values: dict[AttrKey, any_ndarray[NodeValue]] | None = None,
-        node_masks: dict[AttrKey, any_ndarray[bool]] | None = None,
-        *,
-        key_to_id: dict[NodeKey, IndexValue] | None = None,
-        id_to_key: list[NodeKey] | None = None,
-        edge_keys: list[EdgeKey] | None = None,
-        use_compat_graph: bool | None = None,
-        **attr,
-    ) -> MultiGraph | CudaGraph:
-        dst_indices = cp.array(
-            # cp.repeat is slow to use here, so use numpy instead
-            np.repeat(compressed_dsts.get(), cp.diff(indptr).get())
-        )
-        return cls.from_coo(
-            N,
-            src_indices,
-            dst_indices,
-            edge_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            id_to_key=id_to_key,
-            edge_keys=edge_keys,
-            use_compat_graph=use_compat_graph,
-            **attr,
-        )
-
-
-class CudaMultiGraph(CudaGraph):
-    # networkx properties
-    edge_key_dict_factory: ClassVar[type] = dict
-
-    # Not networkx properties
-
-    # In a MultiGraph, each edge has a unique `(src, dst, key)` key.
-    # By default, `key` is 0 if possible, else 1, else 2, etc.
-    # This key can be any hashable Python object in NetworkX.
-    # We don't use a dict for our data structure here, because
-    # that would require a `(src, dst, key)` key.
-    # Instead, we keep `edge_keys` and/or `edge_indices`.
-    # `edge_keys` is the list of Python objects for each edge.
-    # `edge_indices` is for the common case of default multiedge keys,
-    # in which case we can store it as a cupy array.
-    # `edge_indices` is generally preferred. It is possible to provide
-    # both where edge_indices is the default and edge_keys is anything.
-    # It is also possible for them both to be None, which means the
-    # default edge indices has not yet been calculated.
-    edge_indices: cp.ndarray[IndexValue] | None
-    edge_keys: list[EdgeKey] | None
-
-    ####################
-    # Creation methods #
-    ####################
-
-    from_coo = classmethod(MultiGraph.from_coo.__func__)
-    from_csr = classmethod(MultiGraph.from_csr.__func__)
-    from_csc = classmethod(MultiGraph.from_csc.__func__)
-    from_dcsr = classmethod(MultiGraph.from_dcsr.__func__)
-    from_dcsc = classmethod(MultiGraph.from_dcsc.__func__)
-
-    def __new__(
-        cls, incoming_graph_data=None, multigraph_input=None, **attr
-    ) -> CudaMultiGraph:
-        if isinstance(incoming_graph_data, dict) and multigraph_input is not False:
-            new_graph = nxcg.from_networkx(
-                nx.MultiGraph(incoming_graph_data, multigraph_input=multigraph_input),
-                preserve_all_attrs=True,
-            )
-        else:
-            new_graph = super().__new__(cls, incoming_graph_data)
-        new_graph.graph.update(attr)
-        return new_graph
-
-    #################
-    # Class methods #
-    #################
-
-    is_directed = classmethod(MultiGraph.is_directed.__func__)
-    is_multigraph = classmethod(MultiGraph.is_multigraph.__func__)
-    to_cudagraph_class = classmethod(MultiGraph.to_cudagraph_class.__func__)
-    to_networkx_class = classmethod(MultiGraph.to_networkx_class.__func__)
-
-    @classmethod
-    @networkx_api
-    def to_directed_class(cls) -> type[nxcg.CudaMultiDiGraph]:
-        return nxcg.CudaMultiDiGraph
-
-    @classmethod
-    @networkx_api
-    def to_undirected_class(cls) -> type[CudaMultiGraph]:
-        return CudaMultiGraph
-
-    @classmethod
-    def _to_compat_graph_class(cls) -> type[MultiGraph]:
-        return MultiGraph
-
-    ##########################
-    # NetworkX graph methods #
-    ##########################
-
-    @networkx_api
-    def clear(self) -> None:
-        super().clear()
-        self.edge_indices = None
-        self.edge_keys = None
-
-    @networkx_api
-    def clear_edges(self) -> None:
-        super().clear_edges()
-        self.edge_indices = None
-        self.edge_keys = None
-
-    @networkx_api
-    def copy(self, as_view: bool = False) -> CudaMultiGraph:
-        # Does shallow copy in networkx
-        return self._copy(as_view, self.__class__)
-
-    @networkx_api
-    def get_edge_data(
-        self,
-        u: NodeKey,
-        v: NodeKey,
-        key: EdgeKey | None = None,
-        default: EdgeValue | None = None,
-    ):
-        if self.key_to_id is not None:
-            try:
-                u = self.key_to_id[u]
-                v = self.key_to_id[v]
-            except KeyError:
-                return default
-        else:
-            try:
-                if u < 0 or v < 0 or u >= self._N or v >= self._N:
-                    return default
-            except TypeError:
-                return default
-        mask = (self.src_indices == u) & (self.dst_indices == v)
-        if not mask.any():
-            return default
-        if self.edge_keys is None:
-            if self.edge_indices is None:
-                self._calculate_edge_indices()
-            if key is not None:
-                try:
-                    mask = mask & (self.edge_indices == key)
-                except TypeError:
-                    return default
-        indices = cp.nonzero(mask)[0]
-        if indices.size == 0:
-            return default
-        edge_keys = self.edge_keys
-        if key is not None and edge_keys is not None:
-            mask[[i for i in indices.tolist() if edge_keys[i] != key]] = False
-            indices = cp.nonzero(mask)[0]
-            if indices.size == 0:
-                return default
-        if key is not None:
-            [index] = indices.tolist()
-            return {
-                k: v[index].tolist()
-                for k, v in self.edge_values.items()
-                if k not in self.edge_masks or self.edge_masks[k][index]
-            }
-        return {
-            edge_keys[index] if edge_keys is not None else index: {
-                k: v[index].tolist()
-                for k, v in self.edge_values.items()
-                if k not in self.edge_masks or self.edge_masks[k][index]
-            }
-            for index in indices.tolist()
-        }
-
-    @networkx_api
-    def has_edge(self, u: NodeKey, v: NodeKey, key: EdgeKey | None = None) -> bool:
-        if self.key_to_id is not None:
-            try:
-                u = self.key_to_id[u]
-                v = self.key_to_id[v]
-            except KeyError:
-                return False
-        mask = (self.src_indices == u) & (self.dst_indices == v)
-        if key is None or (self.edge_indices is None and self.edge_keys is None):
-            return bool(mask.any())
-        if self.edge_keys is None:
-            try:
-                return bool((mask & (self.edge_indices == key)).any())
-            except TypeError:
-                return False
-        indices = cp.nonzero(mask)[0]
-        if indices.size == 0:
-            return False
-        edge_keys = self.edge_keys
-        return any(edge_keys[i] == key for i in indices.tolist())
-
-    @networkx_api
-    def to_directed(self, as_view: bool = False) -> nxcg.CudaMultiDiGraph:
-        return self._copy(as_view, self.to_directed_class())
-
-    @networkx_api
-    def to_undirected(self, as_view: bool = False) -> CudaMultiGraph:
-        # Does deep copy in networkx
-        return self._copy(as_view, self.to_undirected_class())
-
-    ###################
-    # Private methods #
-    ###################
-
-    def _copy(self, as_view: bool, cls: type[CudaGraph], reverse: bool = False):
-        # DRY warning: see also CudaGraph._copy
-        src_indices = self.src_indices
-        dst_indices = self.dst_indices
-        edge_indices = self.edge_indices
-        edge_values = self.edge_values
-        edge_masks = self.edge_masks
-        node_values = self.node_values
-        node_masks = self.node_masks
-        key_to_id = self.key_to_id
-        id_to_key = None if key_to_id is None else self._id_to_key
-        edge_keys = self.edge_keys
-        if self.__networkx_cache__ is None:
-            __networkx_cache__ = None
-        elif not reverse and cls is self.__class__:
-            __networkx_cache__ = self.__networkx_cache__
-        else:
-            __networkx_cache__ = {}
-        if not as_view:
-            src_indices = src_indices.copy()
-            dst_indices = dst_indices.copy()
-            edge_indices = edge_indices.copy()
-            edge_values = {key: val.copy() for key, val in edge_values.items()}
-            edge_masks = {key: val.copy() for key, val in edge_masks.items()}
-            node_values = {key: val.copy() for key, val in node_values.items()}
-            node_masks = {key: val.copy() for key, val in node_masks.items()}
-            if key_to_id is not None:
-                key_to_id = key_to_id.copy()
-                if id_to_key is not None:
-                    id_to_key = id_to_key.copy()
-            if edge_keys is not None:
-                edge_keys = edge_keys.copy()
-            if __networkx_cache__ is not None:
-                __networkx_cache__ = __networkx_cache__.copy()
-        if reverse:
-            src_indices, dst_indices = dst_indices, src_indices
-        rv = cls.from_coo(
-            self._N,
-            src_indices,
-            dst_indices,
-            edge_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            id_to_key=id_to_key,
-            edge_keys=edge_keys,
-            use_compat_graph=False,
-        )
-        if as_view:
-            rv.graph = self.graph
-        else:
-            rv.graph.update(deepcopy(self.graph))
-        rv.__networkx_cache__ = __networkx_cache__
-        return rv
-
-    def _sort_edge_indices(self, primary="src"):
-        # DRY warning: see also CudaGraph._sort_edge_indices
-        if self.edge_indices is None and self.edge_keys is None:
-            return super()._sort_edge_indices(primary=primary)
-        if primary == "src":
-            if self.edge_indices is None:
-                stacked = (self.dst_indices, self.src_indices)
-            else:
-                stacked = (self.edge_indices, self.dst_indices, self.src_indices)
-        elif primary == "dst":
-            if self.edge_indices is None:
-                stacked = (self.src_indices, self.dst_indices)
-            else:
-                stacked = (self.edge_indices, self.dst_indices, self.src_indices)
-        else:
-            raise ValueError(
-                f'Bad `primary` argument; expected "src" or "dst", got {primary!r}'
-            )
-        indices = cp.lexsort(cp.vstack(stacked))
-        if (cp.diff(indices) > 0).all():
-            # Already sorted
-            return
-        self.src_indices = self.src_indices[indices]
-        self.dst_indices = self.dst_indices[indices]
-        self.edge_values.update(
-            {key: val[indices] for key, val in self.edge_values.items()}
-        )
-        self.edge_masks.update(
-            {key: val[indices] for key, val in self.edge_masks.items()}
-        )
-        if self.edge_indices is not None:
-            self.edge_indices = self.edge_indices[indices]
-        if self.edge_keys is not None:
-            edge_keys = self.edge_keys
-            self.edge_keys = [edge_keys[i] for i in indices.tolist()]
diff --git a/python/nx-cugraph/nx_cugraph/convert.py b/python/nx-cugraph/nx_cugraph/convert.py
deleted file mode 100644
index a872f13ac70..00000000000
--- a/python/nx-cugraph/nx_cugraph/convert.py
+++ /dev/null
@@ -1,875 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-import functools
-import itertools
-import operator as op
-from collections import Counter, defaultdict
-from collections.abc import Mapping
-from typing import TYPE_CHECKING
-
-import cupy as cp
-import networkx as nx
-import numpy as np
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-
-from .utils import index_dtype, networkx_algorithm
-from .utils.misc import _And_NotImplementedError, pairwise
-
-if _nxver >= (3, 4):
-    from networkx.utils.backends import _get_cache_key, _get_from_cache, _set_to_cache
-
-if TYPE_CHECKING:  # pragma: no cover
-    from nx_cugraph.typing import AttrKey, Dtype, EdgeValue, NodeValue, any_ndarray
-
-__all__ = [
-    "from_networkx",
-    "to_networkx",
-    "from_dict_of_lists",
-    "to_dict_of_lists",
-]
-
-concat = itertools.chain.from_iterable
-# A "required" attribute is one that all edges or nodes must have or KeyError is raised
-REQUIRED = ...
-
-
-def _iterate_values(graph, adj, is_dicts, func):
-    # Using `dict.values` is faster and is the common case, but it doesn't always work
-    if is_dicts is not False:
-        it = concat(map(dict.values, adj.values()))
-        if graph is not None and graph.is_multigraph():
-            it = concat(map(dict.values, it))
-        try:
-            return func(it), True
-        except TypeError:
-            if is_dicts is True:
-                raise
-    # May not be regular dicts
-    it = concat(x.values() for x in adj.values())
-    if graph is not None and graph.is_multigraph():
-        it = concat(x.values() for x in it)
-    return func(it), False
-
-
-# Consider adding this to `utils` if it is useful elsewhere
-def _fallback_decorator(func):
-    """Catch and convert exceptions to ``NotImplementedError``; use as a decorator.
-
-    ``nx.NetworkXError`` are raised without being converted. This allows
-    falling back to other backends if, for example, conversion to GPU failed.
-    """
-
-    @functools.wraps(func)
-    def inner(*args, **kwargs):
-        try:
-            return func(*args, **kwargs)
-        except nx.NetworkXError:
-            raise
-        except Exception as exc:
-            raise _And_NotImplementedError(exc) from exc
-
-    return inner
-
-
-@_fallback_decorator
-def from_networkx(
-    graph: nx.Graph,
-    edge_attrs: AttrKey | dict[AttrKey, EdgeValue | None] | None = None,
-    edge_dtypes: Dtype | dict[AttrKey, Dtype | None] | None = None,
-    *,
-    node_attrs: AttrKey | dict[AttrKey, NodeValue | None] | None = None,
-    node_dtypes: Dtype | dict[AttrKey, Dtype | None] | None = None,
-    preserve_all_attrs: bool = False,
-    preserve_edge_attrs: bool = False,
-    preserve_node_attrs: bool = False,
-    preserve_graph_attrs: bool = False,
-    as_directed: bool = False,
-    name: str | None = None,
-    graph_name: str | None = None,
-    use_compat_graph: bool | None = False,
-) -> nxcg.Graph | nxcg.CudaGraph:
-    """Convert a networkx graph to nx_cugraph graph; can convert all attributes.
-
-    Parameters
-    ----------
-    G : networkx.Graph
-    edge_attrs : str or dict, optional
-        Dict that maps edge attributes to default values if missing in ``G``.
-        If None, then no edge attributes will be converted.
-        If default value is None, then missing values are handled with a mask.
-        A default value of ``nxcg.convert.REQUIRED`` or ``...`` indicates that
-        all edges have data for this attribute, and raise `KeyError` if not.
-        For convenience, `edge_attrs` may be a single attribute with default 1;
-        for example ``edge_attrs="weight"``.
-    edge_dtypes : dtype or dict, optional
-    node_attrs : str or dict, optional
-        Dict that maps node attributes to default values if missing in ``G``.
-        If None, then no node attributes will be converted.
-        If default value is None, then missing values are handled with a mask.
-        A default value of ``nxcg.convert.REQUIRED`` or ``...`` indicates that
-        all edges have data for this attribute, and raise `KeyError` if not.
-        For convenience, `node_attrs` may be a single attribute with no default;
-        for example ``node_attrs="weight"``.
-    node_dtypes : dtype or dict, optional
-    preserve_all_attrs : bool, default False
-        If True, then equivalent to setting preserve_edge_attrs, preserve_node_attrs,
-        and preserve_graph_attrs to True.
-    preserve_edge_attrs : bool, default False
-        Whether to preserve all edge attributes.
-    preserve_node_attrs : bool, default False
-        Whether to preserve all node attributes.
-    preserve_graph_attrs : bool, default False
-        Whether to preserve all graph attributes.
-    as_directed : bool, default False
-        If True, then the returned graph will be directed regardless of input.
-        If False, then the returned graph type is determined by input graph.
-    name : str, optional
-        The name of the algorithm when dispatched from networkx.
-    graph_name : str, optional
-        The name of the graph argument geing converted when dispatched from networkx.
-    use_compat_graph : bool or None, default False
-        Indicate whether to return a graph that is compatible with NetworkX graph.
-        For example, ``nx_cugraph.Graph`` can be used as a NetworkX graph and can
-        reside in host (CPU) or device (GPU) memory. The default is False, which
-        will return e.g. ``nx_cugraph.CudaGraph`` that only resides on device (GPU)
-        and is not fully compatible as a NetworkX graph.
-
-    Returns
-    -------
-    nx_cugraph.Graph or nx_cugraph.CudaGraph
-
-    Notes
-    -----
-    For optimal performance, be as specific as possible about what is being converted:
-
-    1. Do you need edge values? Creating a graph with just the structure is the fastest.
-    2. Do you know the edge attribute(s) you need? Specify with `edge_attrs`.
-    3. Do you know the default values? Specify with ``edge_attrs={weight: default}``.
-    4. Do you know if all edges have values? Specify with ``edge_attrs={weight: ...}``.
-    5. Do you know the dtype of attributes? Specify with `edge_dtypes`.
-
-    Conversely, using ``preserve_edge_attrs=True`` or ``preserve_all_attrs=True`` are
-    the slowest, but are also the most flexible and generic.
-
-    See Also
-    --------
-    to_networkx : The opposite; convert nx_cugraph graph to networkx graph
-    """
-    # This uses `graph._adj` and `graph._node`, which are private attributes in NetworkX
-    if not isinstance(graph, nx.Graph):
-        if isinstance(graph, nx.classes.reportviews.NodeView):
-            # Convert to a Graph with only nodes (no edges)
-            G = nx.Graph()
-            G.add_nodes_from(graph.items())
-            graph = G
-        else:
-            raise TypeError(f"Expected networkx.Graph; got {type(graph)}")
-    elif isinstance(graph, nxcg.Graph):
-        if (
-            use_compat_graph
-            # Use compat graphs by default
-            or use_compat_graph is None
-            and (_nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs)
-        ):
-            return graph
-        if graph._is_on_gpu:
-            return graph._cudagraph
-        if not graph._is_on_cpu:
-            raise RuntimeError(
-                f"{type(graph).__name__} cannot be converted to the GPU, because it is "
-                "not on the CPU! This is not supposed to be possible. If you believe "
-                "you have found a bug, please report a minimum reproducible example to "
-                "https://github.com/rapidsai/cugraph/issues/new/choose"
-            )
-        if _nxver >= (3, 4):
-            cache_key = _get_cache_key(
-                edge_attrs=edge_attrs,
-                node_attrs=node_attrs,
-                preserve_edge_attrs=preserve_edge_attrs,
-                preserve_node_attrs=preserve_node_attrs,
-                preserve_graph_attrs=preserve_graph_attrs,
-            )
-            cache = getattr(graph, "__networkx_cache__", None)
-            if cache is not None:
-                cache = cache.setdefault("backends", {}).setdefault("cugraph", {})
-                compat_key, rv = _get_from_cache(cache, cache_key)
-                if rv is not None:
-                    if isinstance(rv, nxcg.Graph):
-                        # This shouldn't happen during normal use, but be extra-careful
-                        rv = rv._cudagraph
-                    if rv is not None:
-                        return rv
-
-    if preserve_all_attrs:
-        preserve_edge_attrs = True
-        preserve_node_attrs = True
-        preserve_graph_attrs = True
-
-    if edge_attrs is not None:
-        if isinstance(edge_attrs, Mapping):
-            # Copy so we don't mutate the original
-            edge_attrs = dict(edge_attrs)
-        else:
-            edge_attrs = {edge_attrs: 1}
-
-    if node_attrs is not None:
-        if isinstance(node_attrs, Mapping):
-            # Copy so we don't mutate the original
-            node_attrs = dict(node_attrs)
-        else:
-            node_attrs = {node_attrs: None}
-
-    if graph.__class__ in {
-        nx.Graph,
-        nx.DiGraph,
-        nx.MultiGraph,
-        nx.MultiDiGraph,
-    } or isinstance(graph, nxcg.Graph):
-        # This is a NetworkX private attribute, but is much faster to use
-        adj = graph._adj
-    else:
-        adj = graph.adj
-    if isinstance(adj, nx.classes.coreviews.FilterAdjacency):
-        adj = {k: dict(v) for k, v in adj.items()}
-
-    is_dicts = None
-    N = len(adj)
-    if (
-        not preserve_edge_attrs
-        and not edge_attrs
-        # Faster than graph.number_of_edges() == 0
-        or next(concat(rowdata.values() for rowdata in adj.values()), None) is None
-    ):
-        # Either we weren't asked to preserve edge attributes, or there are no edges
-        edge_attrs = None
-    elif preserve_edge_attrs:
-        attr_sets, is_dicts = _iterate_values(
-            graph, adj, is_dicts, lambda it: set(map(frozenset, it))
-        )
-        attrs = frozenset.union(*attr_sets)
-        edge_attrs = dict.fromkeys(attrs, REQUIRED)
-        if len(attr_sets) > 1:
-            # Determine which edges have missing data
-            for attr, count in Counter(concat(attr_sets)).items():
-                if count != len(attr_sets):
-                    edge_attrs[attr] = None
-    elif None in edge_attrs.values():
-        # Required edge attributes have a default of None in `edge_attrs`
-        # Verify all edge attributes are present!
-        required = frozenset(
-            attr for attr, default in edge_attrs.items() if default is None
-        )
-        if len(required) == 1:
-            # Fast path for the common case of a single attribute with no default
-            [attr] = required
-            if graph.is_multigraph():
-                it = (
-                    attr in edgedata
-                    for rowdata in adj.values()
-                    for multiedges in rowdata.values()
-                    for edgedata in multiedges.values()
-                )
-            else:
-                it = (
-                    attr in edgedata
-                    for rowdata in adj.values()
-                    for edgedata in rowdata.values()
-                )
-            if next(it):
-                if all(it):
-                    # All edges have data
-                    edge_attrs[attr] = REQUIRED
-                # Else some edges have attribute (default already None)
-            elif not any(it):
-                # No edges have attribute
-                del edge_attrs[attr]
-            # Else some edges have attribute (default already None)
-        else:
-            attr_sets, is_dicts = _iterate_values(
-                graph, adj, is_dicts, lambda it: set(map(required.intersection, it))
-            )
-            for attr in required - frozenset.union(*attr_sets):
-                # No edges have these attributes
-                del edge_attrs[attr]
-            for attr in frozenset.intersection(*attr_sets):
-                # All edges have these attributes
-                edge_attrs[attr] = REQUIRED
-
-    if N == 0:
-        node_attrs = None
-    elif preserve_node_attrs:
-        attr_sets = set(map(frozenset, graph._node.values()))
-        attrs = frozenset.union(*attr_sets)
-        node_attrs = dict.fromkeys(attrs, REQUIRED)
-        if len(attr_sets) > 1:
-            # Determine which nodes have missing data
-            for attr, count in Counter(concat(attr_sets)).items():
-                if count != len(attr_sets):
-                    node_attrs[attr] = None
-    elif node_attrs and None in node_attrs.values():
-        # Required node attributes have a default of None in `node_attrs`
-        # Verify all node attributes are present!
-        required = frozenset(
-            attr for attr, default in node_attrs.items() if default is None
-        )
-        if len(required) == 1:
-            # Fast path for the common case of a single attribute with no default
-            [attr] = required
-            it = (attr in nodedata for nodedata in graph._node.values())
-            if next(it):
-                if all(it):
-                    # All nodes have data
-                    node_attrs[attr] = REQUIRED
-                # Else some nodes have attribute (default already None)
-            elif not any(it):
-                # No nodes have attribute
-                del node_attrs[attr]
-            # Else some nodes have attribute (default already None)
-        else:
-            attr_sets = set(map(required.intersection, graph._node.values()))
-            for attr in required - frozenset.union(*attr_sets):
-                # No nodes have these attributes
-                del node_attrs[attr]
-            for attr in frozenset.intersection(*attr_sets):
-                # All nodes have these attributes
-                node_attrs[attr] = REQUIRED
-
-    key_to_id = dict(zip(adj, range(N)))
-    dst_iter = concat(adj.values())
-    try:
-        no_renumber = all(k == v for k, v in key_to_id.items())
-    except Exception:
-        no_renumber = False
-    if no_renumber:
-        key_to_id = None
-    else:
-        dst_iter = map(key_to_id.__getitem__, dst_iter)
-    if graph.is_multigraph():
-        dst_indices = np.fromiter(dst_iter, index_dtype)
-        num_multiedges, is_dicts = _iterate_values(
-            None, adj, is_dicts, lambda it: np.fromiter(map(len, it), index_dtype)
-        )
-        # cp.repeat is slow to use here, so use numpy instead
-        dst_indices = cp.array(np.repeat(dst_indices, num_multiedges))
-        # Determine edge keys and edge ids for multigraphs
-        if is_dicts:
-            edge_keys = list(concat(concat(map(dict.values, adj.values()))))
-            it = concat(map(dict.values, adj.values()))
-        else:
-            edge_keys = list(concat(concat(x.values() for x in adj.values())))
-            it = concat(x.values() for x in adj.values())
-        edge_indices = cp.fromiter(concat(map(range, map(len, it))), index_dtype)
-        if edge_keys == edge_indices.tolist():
-            edge_keys = None  # Prefer edge_indices
-    else:
-        dst_indices = cp.fromiter(dst_iter, index_dtype)
-
-    edge_values = {}
-    edge_masks = {}
-    if edge_attrs:
-        if edge_dtypes is None:
-            edge_dtypes = {}
-        elif not isinstance(edge_dtypes, Mapping):
-            edge_dtypes = dict.fromkeys(edge_attrs, edge_dtypes)
-        for edge_attr, edge_default in edge_attrs.items():
-            dtype = edge_dtypes.get(edge_attr)
-            if edge_default is None:
-                vals = []
-                append = vals.append
-                if graph.is_multigraph():
-                    iter_mask = (
-                        append(
-                            edgedata[edge_attr]
-                            if (present := edge_attr in edgedata)
-                            else False
-                        )
-                        or present
-                        for rowdata in adj.values()
-                        for multiedges in rowdata.values()
-                        for edgedata in multiedges.values()
-                    )
-                else:
-                    iter_mask = (
-                        append(
-                            edgedata[edge_attr]
-                            if (present := edge_attr in edgedata)
-                            else False
-                        )
-                        or present
-                        for rowdata in adj.values()
-                        for edgedata in rowdata.values()
-                    )
-                edge_masks[edge_attr] = cp.fromiter(iter_mask, bool)
-                edge_values[edge_attr] = cp.array(vals, dtype)
-                # if vals.ndim > 1: ...
-            elif edge_default is REQUIRED:
-                if dtype is None:
-
-                    def func(it, edge_attr=edge_attr):
-                        return cp.array(list(map(op.itemgetter(edge_attr), it)))
-
-                else:
-
-                    def func(it, edge_attr=edge_attr, dtype=dtype):
-                        return cp.fromiter(map(op.itemgetter(edge_attr), it), dtype)
-
-                edge_value, is_dicts = _iterate_values(graph, adj, is_dicts, func)
-                edge_values[edge_attr] = edge_value
-            else:
-                if graph.is_multigraph():
-                    iter_values = (
-                        edgedata.get(edge_attr, edge_default)
-                        for rowdata in adj.values()
-                        for multiedges in rowdata.values()
-                        for edgedata in multiedges.values()
-                    )
-                else:
-                    iter_values = (
-                        edgedata.get(edge_attr, edge_default)
-                        for rowdata in adj.values()
-                        for edgedata in rowdata.values()
-                    )
-                if dtype is None:
-                    edge_values[edge_attr] = cp.array(list(iter_values))
-                else:
-                    edge_values[edge_attr] = cp.fromiter(iter_values, dtype)
-            # if vals.ndim > 1: ...
-
-    # cp.repeat is slow to use here, so use numpy instead
-    src_indices = np.repeat(
-        np.arange(N, dtype=index_dtype),
-        np.fromiter(map(len, adj.values()), index_dtype),
-    )
-    if graph.is_multigraph():
-        src_indices = np.repeat(src_indices, num_multiedges)
-    src_indices = cp.array(src_indices)
-
-    node_values = {}
-    node_masks = {}
-    if node_attrs:
-        nodes = graph._node
-        if node_dtypes is None:
-            node_dtypes = {}
-        elif not isinstance(node_dtypes, Mapping):
-            node_dtypes = dict.fromkeys(node_attrs, node_dtypes)
-        for node_attr, node_default in node_attrs.items():
-            # Iterate over `adj` to ensure consistent order
-            dtype = node_dtypes.get(node_attr)
-            if node_default is None:
-                vals = []
-                append = vals.append
-                iter_mask = (
-                    append(
-                        nodedata[node_attr]
-                        if (present := node_attr in (nodedata := nodes[node_id]))
-                        else False
-                    )
-                    or present
-                    for node_id in adj
-                )
-                # Node values may be numpy or cupy arrays (useful for str, object, etc).
-                # Someday we'll let the user choose np or cp, and support edge values.
-                node_mask = np.fromiter(iter_mask, bool)
-                try:
-                    node_value = np.array(vals, dtype)
-                except ValueError:
-                    # Handle e.g. list elements
-                    if dtype is None or dtype == object:
-                        node_value = np.fromiter(vals, object)
-                    else:
-                        raise
-                else:
-                    try:
-                        node_value = cp.array(node_value)
-                    except ValueError:
-                        pass
-                    else:
-                        node_mask = cp.array(node_mask)
-                node_values[node_attr] = node_value
-                node_masks[node_attr] = node_mask
-                # if vals.ndim > 1: ...
-            else:
-                if node_default is REQUIRED:
-                    iter_values = (nodes[node_id][node_attr] for node_id in adj)
-                else:
-                    iter_values = (
-                        nodes[node_id].get(node_attr, node_default) for node_id in adj
-                    )
-                # Node values may be numpy or cupy arrays (useful for str, object, etc).
-                # Someday we'll let the user choose np or cp, and support edge values.
-                if dtype is None:
-                    vals = list(iter_values)
-                    try:
-                        node_value = np.array(vals)
-                    except ValueError:
-                        # Handle e.g. list elements
-                        node_value = np.fromiter(vals, object)
-                else:
-                    node_value = np.fromiter(iter_values, dtype)
-                try:
-                    node_value = cp.array(node_value)
-                except ValueError:
-                    pass
-                node_values[node_attr] = node_value
-                # if vals.ndim > 1: ...
-    if graph.is_multigraph():
-        if graph.is_directed() or as_directed:
-            klass = nxcg.CudaMultiDiGraph
-        else:
-            klass = nxcg.CudaMultiGraph
-        rv = klass.from_coo(
-            N,
-            src_indices,
-            dst_indices,
-            edge_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            edge_keys=edge_keys,
-            use_compat_graph=False,
-        )
-    else:
-        if graph.is_directed() or as_directed:
-            klass = nxcg.CudaDiGraph
-        else:
-            klass = nxcg.CudaGraph
-        rv = klass.from_coo(
-            N,
-            src_indices,
-            dst_indices,
-            edge_values,
-            edge_masks,
-            node_values,
-            node_masks,
-            key_to_id=key_to_id,
-            use_compat_graph=False,
-        )
-    if preserve_graph_attrs:
-        rv.graph.update(graph.graph)  # deepcopy?
-    if _nxver >= (3, 4) and isinstance(graph, nxcg.Graph) and cache is not None:
-        # Make sure this conversion is added to the cache, and make all of
-        # our graphs share the same `.graph` attribute for consistency.
-        rv.graph = graph.graph
-        _set_to_cache(cache, cache_key, rv)
-    if (
-        use_compat_graph
-        # Use compat graphs by default
-        or use_compat_graph is None
-        and (_nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs)
-    ):
-        return rv._to_compat_graph()
-    return rv
-
-
-def _to_tuples(ndim, L):
-    if ndim > 2:
-        L = list(map(_to_tuples.__get__(ndim - 1), L))
-    return list(map(tuple, L))
-
-
-def _array_to_tuples(a):
-    """Like ``a.tolist()``, but nested structures are tuples instead of lists.
-
-    This is only different from ``a.tolist()`` if ``a.ndim > 1``. It is used to
-    try to return tuples instead of lists for e.g. node values.
-    """
-    if a.ndim > 1:
-        return _to_tuples(a.ndim, a.tolist())
-    return a.tolist()
-
-
-def _iter_attr_dicts(
-    values: dict[AttrKey, any_ndarray[EdgeValue | NodeValue]],
-    masks: dict[AttrKey, any_ndarray[bool]],
-):
-    full_attrs = list(values.keys() - masks.keys())
-    if full_attrs:
-        full_dicts = (
-            dict(zip(full_attrs, vals))
-            for vals in zip(*(_array_to_tuples(values[attr]) for attr in full_attrs))
-        )
-    partial_attrs = list(values.keys() & masks.keys())
-    if partial_attrs:
-        partial_dicts = (
-            {k: v for k, (v, m) in zip(partial_attrs, vals_masks) if m}
-            for vals_masks in zip(
-                *(
-                    zip(values[attr].tolist(), masks[attr].tolist())
-                    for attr in partial_attrs
-                )
-            )
-        )
-    if full_attrs and partial_attrs:
-        full_dicts = (d1.update(d2) or d1 for d1, d2 in zip(full_dicts, partial_dicts))
-    elif partial_attrs:
-        full_dicts = partial_dicts
-    return full_dicts
-
-
-def to_networkx(
-    G: nxcg.Graph | nxcg.CudaGraph, *, sort_edges: bool = False
-) -> nx.Graph:
-    """Convert a nx_cugraph graph to networkx graph.
-
-    All edge and node attributes and ``G.graph`` properties are converted.
-
-    Parameters
-    ----------
-    G : nx_cugraph.Graph or nx_cugraph.CudaGraph
-    sort_edges : bool, default False
-        Whether to sort the edge data of the input graph by (src, dst) indices
-        before converting. This can be useful to convert to networkx graphs
-        that iterate over edges consistently since edges are stored in dicts
-        in the order they were added.
-
-    Returns
-    -------
-    networkx.Graph
-
-    See Also
-    --------
-    from_networkx : The opposite; convert networkx graph to nx_cugraph graph
-    """
-    if isinstance(G, nxcg.Graph):
-        # These graphs are already NetworkX graphs :)
-        return G
-    rv = G.to_networkx_class()()
-    id_to_key = G.id_to_key
-    if sort_edges:
-        G._sort_edge_indices()
-
-    node_values = G.node_values
-    node_masks = G.node_masks
-    if node_values:
-        node_iter = range(len(G))
-        if id_to_key is not None:
-            node_iter = map(id_to_key.__getitem__, node_iter)
-        full_node_dicts = _iter_attr_dicts(node_values, node_masks)
-        rv.add_nodes_from(zip(node_iter, full_node_dicts))
-    elif id_to_key is not None:
-        rv.add_nodes_from(id_to_key)
-    else:
-        rv.add_nodes_from(range(len(G)))
-
-    src_indices = G.src_indices
-    dst_indices = G.dst_indices
-    edge_values = G.edge_values
-    edge_masks = G.edge_masks
-    if not G.is_directed():
-        # Only add upper triangle of the adjacency matrix so we don't double-add edges
-        mask = src_indices <= dst_indices
-        src_indices = src_indices[mask]
-        dst_indices = dst_indices[mask]
-        if edge_values:
-            edge_values = {k: v[mask] for k, v in edge_values.items()}
-        if edge_masks:
-            edge_masks = {k: v[mask] for k, v in edge_masks.items()}
-    src_indices = src_iter = src_indices.tolist()
-    dst_indices = dst_iter = dst_indices.tolist()
-    if id_to_key is not None:
-        src_iter = map(id_to_key.__getitem__, src_indices)
-        dst_iter = map(id_to_key.__getitem__, dst_indices)
-    if G.is_multigraph() and (G.edge_keys is not None or G.edge_indices is not None):
-        if G.edge_keys is not None:
-            if not G.is_directed():
-                edge_keys = [k for k, m in zip(G.edge_keys, mask.tolist()) if m]
-            else:
-                edge_keys = G.edge_keys
-        elif not G.is_directed():
-            edge_keys = G.edge_indices[mask].tolist()
-        else:
-            edge_keys = G.edge_indices.tolist()
-        if edge_values:
-            full_edge_dicts = _iter_attr_dicts(edge_values, edge_masks)
-            rv.add_edges_from(zip(src_iter, dst_iter, edge_keys, full_edge_dicts))
-        else:
-            rv.add_edges_from(zip(src_iter, dst_iter, edge_keys))
-    elif edge_values:
-        full_edge_dicts = _iter_attr_dicts(edge_values, edge_masks)
-        rv.add_edges_from(zip(src_iter, dst_iter, full_edge_dicts))
-    else:
-        rv.add_edges_from(zip(src_iter, dst_iter))
-
-    rv.graph.update(G.graph)
-    return rv
-
-
-def _to_graph(
-    G,
-    edge_attr: AttrKey | None = None,
-    edge_default: EdgeValue | None = 1,
-    edge_dtype: Dtype | None = None,
-) -> nxcg.CudaGraph | nxcg.CudaDiGraph:
-    """Ensure that input type is a nx_cugraph graph, and convert if necessary.
-
-    Directed and undirected graphs are both allowed.
-    This is an internal utility function and may change or be removed.
-    """
-    if isinstance(G, nxcg.CudaGraph):
-        return G
-    if isinstance(G, nx.Graph):
-        return from_networkx(
-            G, {edge_attr: edge_default} if edge_attr is not None else None, edge_dtype
-        )
-    # TODO: handle cugraph.Graph
-    raise TypeError
-
-
-def _to_directed_graph(
-    G,
-    edge_attr: AttrKey | None = None,
-    edge_default: EdgeValue | None = 1,
-    edge_dtype: Dtype | None = None,
-) -> nxcg.CudaDiGraph:
-    """Ensure that input type is a nx_cugraph CudaDiGraph, and convert if necessary.
-
-    Undirected graphs will be converted to directed.
-    This is an internal utility function and may change or be removed.
-    """
-    if isinstance(G, nxcg.CudaDiGraph):
-        return G
-    if isinstance(G, nxcg.CudaGraph):
-        return G.to_directed()
-    if isinstance(G, nx.Graph):
-        return from_networkx(
-            G,
-            {edge_attr: edge_default} if edge_attr is not None else None,
-            edge_dtype,
-            as_directed=True,
-        )
-    # TODO: handle cugraph.Graph
-    raise TypeError
-
-
-def _to_undirected_graph(
-    G,
-    edge_attr: AttrKey | None = None,
-    edge_default: EdgeValue | None = 1,
-    edge_dtype: Dtype | None = None,
-) -> nxcg.CudaGraph:
-    """Ensure that input type is a nx_cugraph CudaGraph, and convert if necessary.
-
-    Only undirected graphs are allowed. Directed graphs will raise ValueError.
-    This is an internal utility function and may change or be removed.
-    """
-    if isinstance(G, nxcg.CudaGraph):
-        if G.is_directed():
-            raise ValueError("Only undirected graphs supported; got a directed graph")
-        return G
-    if isinstance(G, nx.Graph):
-        return from_networkx(
-            G, {edge_attr: edge_default} if edge_attr is not None else None, edge_dtype
-        )
-    # TODO: handle cugraph.Graph
-    raise TypeError
-
-
-@networkx_algorithm(version_added="24.08", fallback=True)
-def from_dict_of_lists(d, create_using=None):
-    from .generators._utils import _create_using_class
-
-    graph_class, inplace = _create_using_class(create_using)
-    key_to_id = defaultdict(itertools.count().__next__)
-    src_indices = cp.array(
-        # cp.repeat is slow to use here, so use numpy instead
-        np.repeat(
-            np.fromiter(map(key_to_id.__getitem__, d), index_dtype),
-            np.fromiter(map(len, d.values()), index_dtype),
-        )
-    )
-    dst_indices = cp.fromiter(
-        map(key_to_id.__getitem__, concat(d.values())), index_dtype
-    )
-    # Initialize as directed first them symmetrize if undirected.
-    G = graph_class.to_directed_class().from_coo(
-        len(key_to_id),
-        src_indices,
-        dst_indices,
-        key_to_id=key_to_id,
-    )
-    if not graph_class.is_directed():
-        G = G.to_undirected()
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="24.08")
-def to_dict_of_lists(G, nodelist=None):
-    G = _to_graph(G)
-    src_indices = G.src_indices
-    dst_indices = G.dst_indices
-    if nodelist is not None:
-        try:
-            node_ids = G._nodekeys_to_nodearray(nodelist)
-        except KeyError as exc:
-            gname = "digraph" if G.is_directed() else "graph"
-            raise nx.NetworkXError(
-                f"The node {exc.args[0]} is not in the {gname}."
-            ) from exc
-        mask = cp.isin(src_indices, node_ids) & cp.isin(dst_indices, node_ids)
-        src_indices = src_indices[mask]
-        dst_indices = dst_indices[mask]
-    # Sort indices so we can use `cp.unique` to determine boundaries.
-    # This is like exporting to DCSR.
-    if G.is_multigraph():
-        stacked = cp.unique(cp.vstack((src_indices, dst_indices)), axis=1)
-        src_indices = stacked[0]
-        dst_indices = stacked[1]
-    else:
-        stacked = cp.vstack((dst_indices, src_indices))
-        indices = cp.lexsort(stacked)
-        src_indices = src_indices[indices]
-        dst_indices = dst_indices[indices]
-    compressed_srcs, left_bounds = cp.unique(src_indices, return_index=True)
-    # Ensure we include isolate nodes in the result (and in proper order)
-    rv = None
-    if nodelist is not None:
-        if compressed_srcs.size != len(nodelist):
-            if G.key_to_id is None:
-                # `G._nodekeys_to_nodearray` does not check for valid node keys.
-                container = range(G._N)
-                for key in nodelist:
-                    if key not in container:
-                        gname = "digraph" if G.is_directed() else "graph"
-                        raise nx.NetworkXError(f"The node {key} is not in the {gname}.")
-            rv = {key: [] for key in nodelist}
-    elif compressed_srcs.size != G._N:
-        rv = {key: [] for key in G}
-    # We use `boundaries` like this in `_groupby` too
-    boundaries = pairwise(itertools.chain(left_bounds.tolist(), [src_indices.size]))
-    dst_indices = dst_indices.tolist()
-    if G.key_to_id is None:
-        it = zip(compressed_srcs.tolist(), boundaries)
-        if rv is None:
-            return {src: dst_indices[start:end] for src, (start, end) in it}
-        rv.update((src, dst_indices[start:end]) for src, (start, end) in it)
-        return rv
-    to_key = G.id_to_key.__getitem__
-    it = zip(compressed_srcs.tolist(), boundaries)
-    if rv is None:
-        return {
-            to_key(src): list(map(to_key, dst_indices[start:end]))
-            for src, (start, end) in it
-        }
-    rv.update(
-        (to_key(src), list(map(to_key, dst_indices[start:end])))
-        for src, (start, end) in it
-    )
-    return rv
diff --git a/python/nx-cugraph/nx_cugraph/convert_matrix.py b/python/nx-cugraph/nx_cugraph/convert_matrix.py
deleted file mode 100644
index 54975902861..00000000000
--- a/python/nx-cugraph/nx_cugraph/convert_matrix.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-import numpy as np
-
-from nx_cugraph import _nxver
-
-from .generators._utils import _create_using_class
-from .utils import _cp_iscopied_asarray, index_dtype, networkx_algorithm
-
-__all__ = [
-    "from_pandas_edgelist",
-    "from_scipy_sparse_array",
-]
-
-
-# Value columns with string dtype is not supported
-@networkx_algorithm(is_incomplete=True, version_added="23.12", fallback=True)
-def from_pandas_edgelist(
-    df,
-    source="source",
-    target="target",
-    edge_attr=None,
-    create_using=None,
-    edge_key=None,
-):
-    """cudf.DataFrame inputs also supported; value columns with str is unsuppported."""
-    # This function never shares ownership of the underlying arrays of the DataFrame
-    # columns. We will perform a copy if necessary even if given e.g. a cudf.DataFrame.
-    graph_class, inplace = _create_using_class(create_using)
-    # Try to be optimal whether using pandas, cudf, or cudf.pandas
-    src_series = df[source]
-    dst_series = df[target]
-    try:
-        # Optimistically try to use cupy, but fall back to numpy if necessary
-        src_array = src_series.to_cupy()
-        dst_array = dst_series.to_cupy()
-    except (AttributeError, TypeError, ValueError, NotImplementedError):
-        src_array = src_series.to_numpy()
-        dst_array = dst_series.to_numpy()
-    try:
-        # Minimize unnecessary data copies by tracking whether we copy or not
-        is_src_copied, src_array = _cp_iscopied_asarray(
-            src_array, orig_object=src_series
-        )
-        is_dst_copied, dst_array = _cp_iscopied_asarray(
-            dst_array, orig_object=dst_series
-        )
-        np_or_cp = cp
-    except ValueError:
-        is_src_copied = is_dst_copied = False
-        src_array = np.asarray(src_array)
-        dst_array = np.asarray(dst_array)
-        np_or_cp = np
-    # TODO: create renumbering helper function(s)
-    # Renumber step 0: node keys
-    nodes = np_or_cp.unique(np_or_cp.concatenate([src_array, dst_array]))
-    N = nodes.size
-    kwargs = {}
-    if N > 0 and (
-        nodes[0] != 0
-        or nodes[N - 1] != N - 1
-        or (
-            nodes.dtype.kind not in {"i", "u"}
-            and not (nodes == np_or_cp.arange(N, dtype=np.int64)).all()
-        )
-    ):
-        # We need to renumber indices--np_or_cp.searchsorted to the rescue!
-        kwargs["id_to_key"] = nodes.tolist()
-        src_indices = cp.asarray(np_or_cp.searchsorted(nodes, src_array), index_dtype)
-        dst_indices = cp.asarray(np_or_cp.searchsorted(nodes, dst_array), index_dtype)
-    else:
-        # Copy if necessary so we don't share ownership of input arrays.
-        if is_src_copied:
-            src_indices = src_array
-        else:
-            src_indices = cp.array(src_array)
-        if is_dst_copied:
-            dst_indices = dst_array
-        else:
-            dst_indices = cp.array(dst_array)
-
-    if not graph_class.is_directed():
-        # Symmetrize the edges
-        mask = src_indices != dst_indices
-        if mask.all():
-            mask = None
-        src_indices, dst_indices = (
-            cp.hstack(
-                (src_indices, dst_indices[mask] if mask is not None else dst_indices)
-            ),
-            cp.hstack(
-                (dst_indices, src_indices[mask] if mask is not None else src_indices)
-            ),
-        )
-
-    if edge_attr is not None:
-        # Additional columns requested for edge data
-        if edge_attr is True:
-            attr_col_headings = df.columns.difference({source, target}).to_list()
-        elif isinstance(edge_attr, (list, tuple)):
-            attr_col_headings = edge_attr
-        else:
-            attr_col_headings = [edge_attr]
-        if len(attr_col_headings) == 0:
-            raise nx.NetworkXError(
-                "Invalid edge_attr argument: No columns found with name: "
-                f"{attr_col_headings}"
-            )
-        try:
-            edge_values = {
-                key: cp.array(val.to_numpy())
-                for key, val in df[attr_col_headings].items()
-            }
-        except (KeyError, TypeError) as exc:
-            raise nx.NetworkXError(f"Invalid edge_attr argument: {edge_attr}") from exc
-
-        if not graph_class.is_directed():
-            # Symmetrize the edges
-            edge_values = {
-                key: cp.hstack((val, val[mask] if mask is not None else val))
-                for key, val in edge_values.items()
-            }
-        kwargs["edge_values"] = edge_values
-
-    if (
-        graph_class.is_multigraph()
-        and edge_key is not None
-        and (
-            # In nx <= 3.3, `edge_key` was ignored if `edge_attr` is None
-            edge_attr is not None
-            or _nxver > (3, 3)
-        )
-    ):
-        try:
-            edge_keys = df[edge_key].to_list()
-        except (KeyError, TypeError) as exc:
-            raise nx.NetworkXError(f"Invalid edge_key argument: {edge_key}") from exc
-        if not graph_class.is_directed():
-            # Symmetrize the edges; remember, `edge_keys` is a list!
-            if mask is None:
-                edge_keys *= 2
-            else:
-                edge_keys += [
-                    key for keep, key in zip(mask.tolist(), edge_keys) if keep
-                ]
-        kwargs["edge_keys"] = edge_keys
-
-    G = graph_class.from_coo(N, src_indices, dst_indices, **kwargs)
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12", fallback=True)
-def from_scipy_sparse_array(
-    A, parallel_edges=False, create_using=None, edge_attribute="weight"
-):
-    graph_class, inplace = _create_using_class(create_using)
-    m, n = A.shape
-    if m != n:
-        raise nx.NetworkXError(f"Adjacency matrix not square: nx,ny={A.shape}")
-    if A.format != "coo":
-        A = A.tocoo()
-    if A.dtype.kind in {"i", "u"} and graph_class.is_multigraph() and parallel_edges:
-        src_indices = cp.array(np.repeat(A.row, A.data), index_dtype)
-        dst_indices = cp.array(np.repeat(A.col, A.data), index_dtype)
-        weight = cp.empty(src_indices.size, A.data.dtype)
-        weight[:] = 1
-    else:
-        src_indices = cp.array(A.row, index_dtype)
-        dst_indices = cp.array(A.col, index_dtype)
-        weight = cp.array(A.data)
-    G = graph_class.from_coo(
-        n, src_indices, dst_indices, edge_values={"weight": weight}
-    )
-    if inplace:
-        return create_using._become(G)
-    return G
diff --git a/python/nx-cugraph/nx_cugraph/generators/__init__.py b/python/nx-cugraph/nx_cugraph/generators/__init__.py
deleted file mode 100644
index 60a9d92373a..00000000000
--- a/python/nx-cugraph/nx_cugraph/generators/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .classic import *
-from .community import *
-from .ego import *
-from .small import *
-from .social import *
diff --git a/python/nx-cugraph/nx_cugraph/generators/_utils.py b/python/nx-cugraph/nx_cugraph/generators/_utils.py
deleted file mode 100644
index bc9ab84bdad..00000000000
--- a/python/nx-cugraph/nx_cugraph/generators/_utils.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import operator as op
-
-import cupy as cp
-import networkx as nx
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-
-from ..utils import index_dtype
-
-# 3.2.1 fixed some issues in generators that occur in 3.2 and earlier
-_IS_NX32_OR_LESS = (nxver := nx.__version__)[:3] <= "3.2" and (
-    len(nxver) <= 3 or nxver[3] != "." and not nxver[3].isdigit()
-)
-
-
-def _ensure_int(n):
-    """Ensure n is integral."""
-    return op.index(n)
-
-
-def _ensure_nonnegative_int(n):
-    """Ensure n is a nonnegative integer."""
-    n = op.index(n)
-    if n < 0:
-        raise nx.NetworkXError(f"Negative number of nodes not valid: {n}")
-    return n
-
-
-def _complete_graph_indices(n):
-    all_indices = cp.indices((n, n), dtype=index_dtype)
-    src_indices = all_indices[0].ravel()
-    dst_indices = all_indices[1].ravel()
-    del all_indices
-    mask = src_indices != dst_indices
-    return (src_indices[mask], dst_indices[mask])
-
-
-def _common_small_graph(n, nodes, create_using, *, allow_directed=True):
-    """Create a "common graph" for small n.
-
-    n == 0: empty graph
-    n == 1: empty graph
-    n == 2: complete graph
-    n > 2: undefined
-    """
-    graph_class, inplace = _create_using_class(create_using)
-    if not allow_directed and graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    if n < 2:
-        G = graph_class.from_coo(
-            n, cp.empty(0, index_dtype), cp.empty(0, index_dtype), id_to_key=nodes
-        )
-    else:
-        G = graph_class.from_coo(
-            n,
-            cp.arange(2, dtype=index_dtype),
-            cp.array([1, 0], index_dtype),
-            id_to_key=nodes,
-        )
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-def _create_using_class(create_using, *, default=nx.Graph):
-    """Handle ``create_using`` argument and return a Graph type from nx_cugraph."""
-    inplace = False
-    if create_using is None:
-        G = default()
-    elif isinstance(create_using, type):
-        G = create_using()
-    elif not hasattr(create_using, "is_directed") or not hasattr(
-        create_using, "is_multigraph"
-    ):
-        raise TypeError("create_using is not a valid graph type or instance")
-    elif not isinstance(create_using, (nxcg.Graph, nxcg.CudaGraph)):
-        raise NotImplementedError(
-            f"create_using with object of type {type(create_using)} is not supported "
-            "by the cugraph backend; only nx_cugraph.Graph or nx_cugraph.CudaGraph "
-            "objects are allowed."
-        )
-    else:
-        inplace = True
-        G = create_using
-        G.clear()
-    if not isinstance(G, (nxcg.Graph, nxcg.CudaGraph)):
-        if G.is_multigraph():
-            if G.is_directed():
-                graph_class = nxcg.MultiDiGraph
-            else:
-                graph_class = nxcg.MultiGraph
-        elif G.is_directed():
-            graph_class = nxcg.DiGraph
-        else:
-            graph_class = nxcg.Graph
-        if _nxver >= (3, 3) and not nx.config.backends.cugraph.use_compat_graphs:
-            graph_class = graph_class.to_cudagraph_class()
-        if G.__class__ not in {nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph}:
-            raise NotImplementedError(
-                f"create_using with type {type(G)} is not supported by the cugraph "
-                "backend; only standard networkx or nx_cugraph graph objects are "
-                "allowed (but not customized subclasses derived from them)."
-            )
-    else:
-        graph_class = G.__class__
-    return graph_class, inplace
-
-
-def _number_and_nodes(n_and_nodes):
-    n, nodes = n_and_nodes
-    try:
-        n = op.index(n)
-    except TypeError:
-        n = len(nodes)
-    if n < 0:
-        raise nx.NetworkXError(f"Negative number of nodes not valid: {n}")
-    if not isinstance(nodes, list):
-        nodes = list(nodes)
-    if not nodes:
-        return (n, None)
-    if nodes[0] == 0 and nodes[n - 1] == n - 1:
-        try:
-            if nodes == list(range(n)):
-                return (n, None)
-        except Exception:
-            pass
-    return (n, nodes)
diff --git a/python/nx-cugraph/nx_cugraph/generators/classic.py b/python/nx-cugraph/nx_cugraph/generators/classic.py
deleted file mode 100644
index cfcb2a3afec..00000000000
--- a/python/nx-cugraph/nx_cugraph/generators/classic.py
+++ /dev/null
@@ -1,420 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import itertools
-from numbers import Integral
-
-import cupy as cp
-import networkx as nx
-import numpy as np
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-
-from ..utils import _get_int_dtype, index_dtype, networkx_algorithm
-from ._utils import (
-    _IS_NX32_OR_LESS,
-    _common_small_graph,
-    _complete_graph_indices,
-    _create_using_class,
-    _ensure_int,
-    _ensure_nonnegative_int,
-    _number_and_nodes,
-)
-
-__all__ = [
-    "barbell_graph",
-    "circular_ladder_graph",
-    "complete_graph",
-    "complete_multipartite_graph",
-    "cycle_graph",
-    "empty_graph",
-    "ladder_graph",
-    "lollipop_graph",
-    "null_graph",
-    "path_graph",
-    "star_graph",
-    "tadpole_graph",
-    "trivial_graph",
-    "turan_graph",
-    "wheel_graph",
-]
-
-concat = itertools.chain.from_iterable
-
-
-@networkx_algorithm(version_added="23.12")
-def barbell_graph(m1, m2, create_using=None):
-    # Like two complete graphs and a path_graph
-    m1 = _ensure_nonnegative_int(m1)
-    if m1 < 2:
-        raise nx.NetworkXError("Invalid graph description, m1 should be >=2")
-    m2 = _ensure_nonnegative_int(m2)
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    src_bell1, dst_bell1 = _complete_graph_indices(m1)
-    src_bell2 = src_bell1 + (m1 + m2)
-    dst_bell2 = dst_bell1 + (m1 + m2)
-    if m2 == 0:
-        src_bar = cp.array([m1 - 1, m1], index_dtype)
-        dst_bar = cp.array([m1, m1 - 1], index_dtype)
-    else:
-        src_bar = cp.arange(2 * m1 - 1, 2 * m1 + 2 * m2 + 1, dtype=index_dtype) // 2
-        dst_bar = (
-            cp.arange(m1 - 1, m1 + m2 + 1, dtype=index_dtype)[:, None]
-            + cp.array([-1, 1], index_dtype)
-        ).ravel()[1:-1]
-    src_indices = cp.hstack((src_bell1, src_bar, src_bell2))
-    dst_indices = cp.hstack((dst_bell1, dst_bar, dst_bell2))
-    G = graph_class.from_coo(2 * m1 + m2, src_indices, dst_indices)
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def circular_ladder_graph(n, create_using=None):
-    return _ladder_graph(n, create_using, is_circular=True)
-
-
-@networkx_algorithm(nodes_or_number=0, version_added="23.12")
-def complete_graph(n, create_using=None):
-    n, nodes = _number_and_nodes(n)
-    if n < 3:
-        return _common_small_graph(n, nodes, create_using)
-    graph_class, inplace = _create_using_class(create_using)
-    src_indices, dst_indices = _complete_graph_indices(n)
-    G = graph_class.from_coo(n, src_indices, dst_indices, id_to_key=nodes)
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def complete_multipartite_graph(*subset_sizes):
-    if not subset_sizes:
-        if _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs:
-            return nxcg.Graph()
-        return nxcg.CudaGraph()
-    try:
-        subset_sizes = [_ensure_int(size) for size in subset_sizes]
-    except TypeError:
-        subsets = [list(subset) for subset in subset_sizes]
-        subset_sizes = [len(subset) for subset in subsets]
-        nodes = list(concat(subsets))
-    else:
-        subsets = nodes = None
-        try:
-            subset_sizes = [_ensure_nonnegative_int(size) for size in subset_sizes]
-        except nx.NetworkXError:
-            if _IS_NX32_OR_LESS:
-                raise NotImplementedError("Negative number of nodes is not supported")
-            raise
-    L1 = []
-    L2 = []
-    total = 0
-    for size in subset_sizes:
-        all_indices = cp.indices((total, size), dtype=index_dtype)
-        L1.append(all_indices[0].ravel())
-        L2.append(all_indices[1].ravel() + total)
-        total += size
-    src_indices = cp.hstack(L1 + L2)
-    dst_indices = cp.hstack(L2 + L1)
-    subsets_array = cp.array(
-        np.repeat(
-            np.arange(len(subset_sizes), dtype=_get_int_dtype(len(subset_sizes) - 1)),
-            subset_sizes,
-        )
-    )
-    return nxcg.Graph.from_coo(
-        subsets_array.size,
-        src_indices,
-        dst_indices,
-        node_values={"subset": subsets_array},
-        id_to_key=nodes,
-        use_compat_graph=_nxver < (3, 3)
-        or nx.config.backends.cugraph.use_compat_graphs,
-    )
-
-
-@networkx_algorithm(nodes_or_number=0, version_added="23.12")
-def cycle_graph(n, create_using=None):
-    n, nodes = _number_and_nodes(n)
-    graph_class, inplace = _create_using_class(create_using)
-    if n == 1:
-        src_indices = cp.zeros(1, index_dtype)
-        dst_indices = cp.zeros(1, index_dtype)
-    elif n == 2 and graph_class.is_multigraph() and not graph_class.is_directed():
-        # This is kind of a peculiar edge case
-        src_indices = cp.array([0, 0, 1, 1], index_dtype)
-        dst_indices = cp.array([1, 1, 0, 0], index_dtype)
-    elif n < 3:
-        return _common_small_graph(n, nodes, create_using)
-    elif graph_class.is_directed():
-        src_indices = cp.arange(n, dtype=index_dtype)
-        dst_indices = cp.arange(1, n + 1, dtype=index_dtype)
-        dst_indices[-1] = 0
-    else:
-        src_indices = cp.arange(2 * n, dtype=index_dtype) // 2
-        dst_indices = (
-            cp.arange(n, dtype=index_dtype)[:, None] + cp.array([-1, 1], index_dtype)
-        ).ravel()
-        dst_indices[0] = n - 1
-        dst_indices[-1] = 0
-    G = graph_class.from_coo(n, src_indices, dst_indices, id_to_key=nodes)
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(nodes_or_number=0, version_added="23.12")
-def empty_graph(n=0, create_using=None, default=nx.Graph):
-    n, nodes = _number_and_nodes(n)
-    graph_class, inplace = _create_using_class(create_using, default=default)
-    G = graph_class.from_coo(
-        n, cp.empty(0, index_dtype), cp.empty(0, index_dtype), id_to_key=nodes
-    )
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-def _ladder_graph(n, create_using, *, is_circular=False):
-    # Like path path_graph with extra arange, and middle link missing
-    n = _ensure_nonnegative_int(n)
-    if n < 2:
-        if not is_circular:
-            return _common_small_graph(2 * n, None, create_using, allow_directed=False)
-        graph_class, inplace = _create_using_class(create_using)
-        if graph_class.is_directed():
-            raise nx.NetworkXError("Directed Graph not supported")
-        if n == 1:
-            src_indices = cp.array([0, 1, 0, 1], index_dtype)
-            dst_indices = cp.array([0, 0, 1, 1], index_dtype)
-            nodes = None
-        elif graph_class.is_multigraph():
-            src_indices = cp.array([0, 0, 1, 1], index_dtype)
-            dst_indices = cp.array([1, 1, 0, 0], index_dtype)
-            nodes = [0, -1]
-        else:
-            src_indices = cp.array([0, 1], index_dtype)
-            dst_indices = cp.array([1, 0], index_dtype)
-            nodes = [0, -1]
-        G = graph_class.from_coo(2, src_indices, dst_indices, id_to_key=nodes)
-        if inplace:
-            return create_using._become(G)
-        return G
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    path_src = cp.arange(1, 2 * n - 1, dtype=index_dtype) // 2
-    path_dst = (
-        cp.arange(n, dtype=index_dtype)[:, None] + cp.array([-1, 1], index_dtype)
-    ).ravel()[1:-1]
-    srcs = [path_src, path_src + n, cp.arange(2 * n, dtype=index_dtype)]
-    dsts = [
-        path_dst,
-        path_dst + n,
-        cp.arange(n, 2 * n, dtype=index_dtype),
-        cp.arange(0, n, dtype=index_dtype),
-    ]
-    if is_circular and (n > 2 or graph_class.is_multigraph()):
-        srcs.append(cp.array([0, n - 1, n, 2 * n - 1], index_dtype))
-        dsts.append(cp.array([n - 1, 0, 2 * n - 1, n], index_dtype))
-    src_indices = cp.hstack(srcs)
-    dst_indices = cp.hstack(dsts)
-    G = graph_class.from_coo(2 * n, src_indices, dst_indices)
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def ladder_graph(n, create_using=None):
-    return _ladder_graph(n, create_using)
-
-
-@networkx_algorithm(nodes_or_number=[0, 1], version_added="23.12")
-def lollipop_graph(m, n, create_using=None):
-    # Like complete_graph then path_graph
-    orig_m, unused_nodes_m = m
-    orig_n, unused_nodes_n = n
-    m, m_nodes = _number_and_nodes(m)
-    if m < 2:
-        raise nx.NetworkXError(
-            "Invalid description: m should indicate at least 2 nodes"
-        )
-    n, n_nodes = _number_and_nodes(n)
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    msrc_indices, mdst_indices = _complete_graph_indices(m)
-    nsrc_indices = cp.arange(2 * m - 1, 2 * m + 2 * n - 1, dtype=index_dtype) // 2
-    ndst_indices = (
-        cp.arange(m - 1, m + n, dtype=index_dtype)[:, None]
-        + cp.array([-1, 1], index_dtype)
-    ).ravel()[1:-1]
-    src_indices = cp.hstack((msrc_indices, nsrc_indices))
-    dst_indices = cp.hstack((mdst_indices, ndst_indices))
-    if isinstance(orig_m, Integral) and isinstance(orig_n, Integral):
-        nodes = None
-    else:
-        nodes = list(range(m)) if m_nodes is None else m_nodes
-        nodes.extend(range(n) if n_nodes is None else n_nodes)
-        if len(set(nodes)) != len(nodes):
-            raise nx.NetworkXError("Nodes must be distinct in containers m and n")
-    G = graph_class.from_coo(m + n, src_indices, dst_indices, id_to_key=nodes)
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def null_graph(create_using=None):
-    return _common_small_graph(0, None, create_using)
-
-
-@networkx_algorithm(nodes_or_number=0, version_added="23.12")
-def path_graph(n, create_using=None):
-    n, nodes = _number_and_nodes(n)
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        src_indices = cp.arange(n - 1, dtype=index_dtype)
-        dst_indices = cp.arange(1, n, dtype=index_dtype)
-    elif n < 3:
-        return _common_small_graph(n, nodes, create_using)
-    else:
-        src_indices = cp.arange(1, 2 * n - 1, dtype=index_dtype) // 2
-        dst_indices = (
-            cp.arange(n, dtype=index_dtype)[:, None] + cp.array([-1, 1], index_dtype)
-        ).ravel()[1:-1]
-    G = graph_class.from_coo(n, src_indices, dst_indices, id_to_key=nodes)
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(nodes_or_number=0, version_added="23.12")
-def star_graph(n, create_using=None):
-    orig_n, orig_nodes = n
-    n, nodes = _number_and_nodes(n)
-    # star_graph behaves differently whether the input was an int or iterable
-    if isinstance(orig_n, Integral):
-        if nodes is not None:
-            nodes.append(n)
-        n += 1
-    if n < 3:
-        return _common_small_graph(n, nodes, create_using, allow_directed=False)
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    flat = cp.zeros(n - 1, index_dtype)
-    ramp = cp.arange(1, n, dtype=index_dtype)
-    src_indices = cp.hstack((flat, ramp))
-    dst_indices = cp.hstack((ramp, flat))
-    G = graph_class.from_coo(n, src_indices, dst_indices, id_to_key=nodes)
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(nodes_or_number=[0, 1], version_added="23.12")
-def tadpole_graph(m, n, create_using=None):
-    orig_m, unused_nodes_m = m
-    orig_n, unused_nodes_n = n
-    m, m_nodes = _number_and_nodes(m)
-    if m < 2:
-        raise nx.NetworkXError(
-            "Invalid description: m should indicate at least 2 nodes"
-        )
-    n, n_nodes = _number_and_nodes(n)
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    if isinstance(orig_m, Integral) and isinstance(orig_n, Integral):
-        nodes = None
-    else:
-        nodes = list(range(m)) if m_nodes is None else m_nodes
-        nodes.extend(range(n) if n_nodes is None else n_nodes)
-    if m == 2 and not graph_class.is_multigraph():
-        src_indices = cp.arange(1, 2 * (m + n) - 1, dtype=index_dtype) // 2
-        dst_indices = (
-            cp.arange((m + n), dtype=index_dtype)[:, None]
-            + cp.array([-1, 1], index_dtype)
-        ).ravel()[1:-1]
-    else:
-        src_indices = cp.arange(2 * (m + n), dtype=index_dtype) // 2
-        dst_indices = (
-            cp.arange((m + n), dtype=index_dtype)[:, None]
-            + cp.array([-1, 1], index_dtype)
-        ).ravel()
-        dst_indices[0] = m - 1
-        dst_indices[-1] = 0
-    G = graph_class.from_coo(m + n, src_indices, dst_indices, id_to_key=nodes)
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def trivial_graph(create_using=None):
-    return _common_small_graph(1, None, create_using)
-
-
-@networkx_algorithm(version_added="23.12")
-def turan_graph(n, r):
-    if not 1 <= r <= n:
-        raise nx.NetworkXError("Must satisfy 1 <= r <= n")
-    n_div_r, n_mod_r = divmod(n, r)
-    partitions = [n_div_r] * (r - n_mod_r) + [n_div_r + 1] * n_mod_r
-    return complete_multipartite_graph(*partitions)
-
-
-@networkx_algorithm(nodes_or_number=0, version_added="23.12")
-def wheel_graph(n, create_using=None):
-    n, nodes = _number_and_nodes(n)
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    if n < 2:
-        G = graph_class.from_coo(
-            n, cp.empty(0, index_dtype), cp.empty(0, index_dtype), id_to_key=nodes
-        )
-    else:
-        # Like star_graph
-        flat = cp.zeros(n - 1, index_dtype)
-        ramp = cp.arange(1, n, dtype=index_dtype)
-        # Like cycle_graph
-        if n < 3:
-            src_indices = cp.empty(0, index_dtype)
-            dst_indices = cp.empty(0, index_dtype)
-        elif n > 3:
-            src_indices = cp.arange(2, 2 * n, dtype=index_dtype) // 2
-            dst_indices = (
-                cp.arange(1, n, dtype=index_dtype)[:, None]
-                + cp.array([-1, 1], index_dtype)
-            ).ravel()
-            dst_indices[-1] = 1
-            dst_indices[0] = n - 1
-        elif graph_class.is_multigraph():
-            src_indices = cp.array([1, 1, 2, 2], index_dtype)
-            dst_indices = cp.array([2, 2, 1, 1], index_dtype)
-        else:
-            src_indices = cp.array([1, 2], index_dtype)
-            dst_indices = cp.array([2, 1], index_dtype)
-        src_indices = cp.hstack((flat, ramp, src_indices))
-        dst_indices = cp.hstack((ramp, flat, dst_indices))
-        G = graph_class.from_coo(n, src_indices, dst_indices, id_to_key=nodes)
-    if inplace:
-        return create_using._become(G)
-    return G
diff --git a/python/nx-cugraph/nx_cugraph/generators/community.py b/python/nx-cugraph/nx_cugraph/generators/community.py
deleted file mode 100644
index 4e5063cc345..00000000000
--- a/python/nx-cugraph/nx_cugraph/generators/community.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-
-from ..utils import networkx_algorithm
-from ._utils import (
-    _common_small_graph,
-    _complete_graph_indices,
-    _ensure_int,
-    _ensure_nonnegative_int,
-)
-
-__all__ = [
-    "caveman_graph",
-]
-
-
-@networkx_algorithm(version_added="23.12")
-def caveman_graph(l, k):  # noqa: E741
-    l = _ensure_int(l)  # noqa: E741
-    k = _ensure_int(k)
-    N = _ensure_nonnegative_int(k * l)
-    if l == 0 or k < 1:
-        return _common_small_graph(N, None, None)
-    k = _ensure_nonnegative_int(k)
-    src_clique, dst_clique = _complete_graph_indices(k)
-    src_cliques = [src_clique]
-    dst_cliques = [dst_clique]
-    src_cliques.extend(src_clique + i * k for i in range(1, l))
-    dst_cliques.extend(dst_clique + i * k for i in range(1, l))
-    src_indices = cp.hstack(src_cliques)
-    dst_indices = cp.hstack(dst_cliques)
-    use_compat_graph = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs
-    return nxcg.CudaGraph.from_coo(
-        l * k, src_indices, dst_indices, use_compat_graph=use_compat_graph
-    )
diff --git a/python/nx-cugraph/nx_cugraph/generators/ego.py b/python/nx-cugraph/nx_cugraph/generators/ego.py
deleted file mode 100644
index 9a91fa0b6c3..00000000000
--- a/python/nx-cugraph/nx_cugraph/generators/ego.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-
-import cupy as cp
-import networkx as nx
-import numpy as np
-import pylibcugraph as plc
-
-import nx_cugraph as nxcg
-
-from ..utils import _dtype_param, _get_float_dtype, index_dtype, networkx_algorithm
-
-__all__ = ["ego_graph"]
-
-
-@networkx_algorithm(
-    extra_params=_dtype_param, version_added="24.06", _plc={"bfs", "ego_graph", "sssp"}
-)
-def ego_graph(
-    G, n, radius=1, center=True, undirected=False, distance=None, *, dtype=None
-):
-    """Weighted ego_graph with negative cycles is not yet supported. `NotImplementedError` will be raised if there are negative `distance` edge weights."""  # noqa: E501
-    if isinstance(G, nx.Graph):
-        is_compat_graph = isinstance(G, nxcg.Graph)
-        G = nxcg.from_networkx(G, preserve_all_attrs=True)
-    else:
-        is_compat_graph = False
-    if n not in G:
-        if distance is None:
-            raise nx.NodeNotFound(f"Source {n} is not in G")
-        raise nx.NodeNotFound(f"Node {n} not found in graph")
-    src_index = n if G.key_to_id is None else G.key_to_id[n]
-    symmetrize = "union" if undirected and G.is_directed() else None
-    if distance is None or distance not in G.edge_values:
-        # Simple BFS to determine nodes
-        if radius is not None and radius <= 0:
-            if center:
-                node_ids = cp.array([src_index], dtype=index_dtype)
-            else:
-                node_ids = cp.empty(0, dtype=index_dtype)
-            node_mask = None
-        else:
-            if radius is None or np.isinf(radius):
-                radius = -1
-            else:
-                radius = math.ceil(radius)
-            distances, unused_predecessors, node_ids = plc.bfs(
-                handle=plc.ResourceHandle(),
-                graph=G._get_plc_graph(symmetrize=symmetrize),
-                sources=cp.array([src_index], index_dtype),
-                direction_optimizing=False,  # True for undirected only; what's best?
-                depth_limit=radius,
-                compute_predecessors=False,
-                do_expensive_check=False,
-            )
-            node_mask = distances != np.iinfo(distances.dtype).max
-    else:
-        # SSSP to determine nodes
-        if callable(distance):
-            raise NotImplementedError("callable `distance` argument is not supported")
-        if symmetrize and G.is_multigraph():
-            # G._get_plc_graph does not implement `symmetrize=True` w/ edge array
-            raise NotImplementedError(
-                "Weighted ego_graph with undirected=True not implemented"
-            )
-        # Check for negative values since we don't support negative cycles
-        edge_vals = G.edge_values[distance]
-        if distance in G.edge_masks:
-            edge_vals = edge_vals[G.edge_masks[distance]]
-        if (edge_vals < 0).any():
-            raise NotImplementedError(
-                "Negative edge weights not yet supported by ego_graph"
-            )
-        # PERF: we could use BFS if all edges are equal
-        if radius is None:
-            radius = np.inf
-        dtype = _get_float_dtype(dtype, graph=G, weight=distance)
-        node_ids, distances, unused_predecessors = plc.sssp(
-            resource_handle=plc.ResourceHandle(),
-            graph=(G.to_undirected() if symmetrize else G)._get_plc_graph(
-                distance, 1, dtype
-            ),
-            source=src_index,
-            cutoff=np.nextafter(radius, np.inf, dtype=np.float64),
-            compute_predecessors=True,  # TODO: False is not yet supported
-            do_expensive_check=False,
-        )
-        node_mask = distances != np.finfo(distances.dtype).max
-
-    if node_mask is not None:
-        if not center:
-            node_mask &= node_ids != src_index
-        node_ids = node_ids[node_mask]
-    if node_ids.size == G._N:
-        rv = G.copy()
-        if is_compat_graph:
-            return rv._to_compat_graph()
-        return rv
-    # TODO: create renumbering helper function(s)
-    node_ids.sort()  # TODO: is this ever necessary? Keep for safety
-    node_values = {key: val[node_ids] for key, val in G.node_values.items()}
-    node_masks = {key: val[node_ids] for key, val in G.node_masks.items()}
-
-    G._sort_edge_indices()  # TODO: is this ever necessary? Keep for safety
-    edge_mask = cp.isin(G.src_indices, node_ids) & cp.isin(G.dst_indices, node_ids)
-    src_indices = cp.searchsorted(node_ids, G.src_indices[edge_mask]).astype(
-        index_dtype
-    )
-    dst_indices = cp.searchsorted(node_ids, G.dst_indices[edge_mask]).astype(
-        index_dtype
-    )
-    edge_values = {key: val[edge_mask] for key, val in G.edge_values.items()}
-    edge_masks = {key: val[edge_mask] for key, val in G.edge_masks.items()}
-
-    # Renumber nodes
-    if (id_to_key := G.id_to_key) is not None:
-        key_to_id = {
-            id_to_key[old_index]: new_index
-            for new_index, old_index in enumerate(node_ids.tolist())
-        }
-    else:
-        key_to_id = {
-            old_index: new_index
-            for new_index, old_index in enumerate(node_ids.tolist())
-        }
-    kwargs = {
-        "N": node_ids.size,
-        "src_indices": src_indices,
-        "dst_indices": dst_indices,
-        "edge_values": edge_values,
-        "edge_masks": edge_masks,
-        "node_values": node_values,
-        "node_masks": node_masks,
-        "key_to_id": key_to_id,
-        "use_compat_graph": False,
-    }
-    if G.is_multigraph():
-        if G.edge_keys is not None:
-            kwargs["edge_keys"] = [
-                x for x, m in zip(G.edge_keys, edge_mask.tolist()) if m
-            ]
-        if G.edge_indices is not None:
-            kwargs["edge_indices"] = G.edge_indices[edge_mask]
-    rv = G.__class__.from_coo(**kwargs)
-    rv.graph.update(G.graph)
-    if is_compat_graph:
-        return rv._to_compat_graph()
-    return rv
-
-
-@ego_graph._can_run
-def _(G, n, radius=1, center=True, undirected=False, distance=None, *, dtype=None):
-    if distance is not None and undirected and G.is_directed() and G.is_multigraph():
-        return "Weighted ego_graph with undirected=True not implemented"
-    if distance is not None and nx.is_negatively_weighted(G, weight=distance):
-        return "Weighted ego_graph with negative cycles not yet supported"
-    if callable(distance):
-        return "callable `distance` argument is not supported"
-    return True
diff --git a/python/nx-cugraph/nx_cugraph/generators/small.py b/python/nx-cugraph/nx_cugraph/generators/small.py
deleted file mode 100644
index d0c03cb7dd4..00000000000
--- a/python/nx-cugraph/nx_cugraph/generators/small.py
+++ /dev/null
@@ -1,630 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-
-from ..utils import index_dtype, networkx_algorithm
-from ._utils import _IS_NX32_OR_LESS, _create_using_class
-
-__all__ = [
-    "bull_graph",
-    "chvatal_graph",
-    "cubical_graph",
-    "desargues_graph",
-    "diamond_graph",
-    "dodecahedral_graph",
-    "frucht_graph",
-    "heawood_graph",
-    "house_graph",
-    "house_x_graph",
-    "icosahedral_graph",
-    "krackhardt_kite_graph",
-    "moebius_kantor_graph",
-    "octahedral_graph",
-    "pappus_graph",
-    "petersen_graph",
-    "sedgewick_maze_graph",
-    "tetrahedral_graph",
-    "truncated_cube_graph",
-    "truncated_tetrahedron_graph",
-    "tutte_graph",
-]
-
-
-@networkx_algorithm(version_added="23.12")
-def bull_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    src_indices = cp.array([0, 0, 1, 1, 1, 2, 2, 2, 3, 4], index_dtype)
-    dst_indices = cp.array([1, 2, 0, 2, 3, 0, 1, 4, 1, 2], index_dtype)
-    G = graph_class.from_coo(5, src_indices, dst_indices, name="Bull Graph")
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def chvatal_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
-            6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11,
-            11, 11,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            1, 4, 6, 9, 0, 2, 5, 7, 1, 3, 6, 8, 2, 4, 7, 9, 0, 3, 5, 8, 1, 4, 10, 11,
-            0, 2, 10, 11, 1, 3, 8, 11, 2, 4, 7, 10, 0, 3, 10, 11, 5, 6, 8, 9, 5, 6,
-            7, 9,
-        ],
-        index_dtype,
-    )
-    # fmt: on
-    G = graph_class.from_coo(12, src_indices, dst_indices, name="Chvatal Graph")
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def cubical_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    src_indices = cp.array(
-        [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [1, 3, 4, 0, 2, 7, 1, 3, 6, 0, 2, 5, 0, 5, 7, 3, 4, 6, 2, 5, 7, 1, 4, 6],
-        index_dtype,
-    )
-    name = ("Platonic Cubical Graph",) if _IS_NX32_OR_LESS else "Platonic Cubical Graph"
-    G = graph_class.from_coo(8, src_indices, dst_indices, name=name)
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def desargues_graph(create_using=None):
-    # This can also be defined w.r.t. LCF_graph
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7,
-            8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14,
-            14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            1, 5, 19, 0, 2, 16, 1, 3, 11, 2, 4, 14, 3, 5, 9, 0, 4, 6, 5, 7, 15, 6, 8,
-            18, 7, 9, 13, 4, 8, 10, 9, 11, 19, 2, 10, 12, 11, 13, 17, 8, 12, 14, 3,
-            13, 15, 6, 14, 16, 1, 15, 17, 12, 16, 18, 7, 17, 19, 0, 10, 18,
-        ],
-        index_dtype,
-    )
-    # fmt: on
-    if graph_class.is_multigraph():
-        src_indices_extra = cp.array(
-            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
-            index_dtype,
-        )
-        dst_indices_extra = cp.array(
-            [5, 16, 11, 14, 9, 0, 15, 18, 13, 4, 19, 2, 17, 8, 3, 6, 1, 12, 7, 10],
-            index_dtype,
-        )
-        src_indices = cp.hstack((src_indices, src_indices_extra))
-        dst_indices = cp.hstack((dst_indices, dst_indices_extra))
-    G = graph_class.from_coo(20, src_indices, dst_indices, name="Desargues Graph")
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def diamond_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    src_indices = cp.array([0, 0, 1, 1, 1, 2, 2, 2, 3, 3], index_dtype)
-    dst_indices = cp.array([1, 2, 0, 2, 3, 0, 1, 3, 1, 2], index_dtype)
-    G = graph_class.from_coo(4, src_indices, dst_indices, name="Diamond Graph")
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def dodecahedral_graph(create_using=None):
-    # This can also be defined w.r.t. LCF_graph
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7,
-            8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14,
-            14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            1, 10, 19, 0, 2, 8, 1, 3, 6, 2, 4, 19, 3, 5, 17, 4, 6, 15, 2, 5, 7, 6, 8,
-            14, 1, 7, 9, 8, 10, 13, 0, 9, 11, 10, 12, 18, 11, 13, 16, 9, 12, 14, 7,
-            13, 15, 5, 14, 16, 12, 15, 17, 4, 16, 18, 11, 17, 19, 0, 3, 18,
-        ],
-        index_dtype,
-    )
-    # fmt: on
-    if graph_class.is_multigraph():
-        src_indices_extra = cp.array(
-            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
-            index_dtype,
-        )
-        dst_indices_extra = cp.array(
-            [10, 8, 6, 19, 17, 15, 2, 14, 1, 13, 0, 18, 16, 9, 7, 5, 12, 4, 11, 3],
-            index_dtype,
-        )
-        src_indices = cp.hstack((src_indices, src_indices_extra))
-        dst_indices = cp.hstack((dst_indices, dst_indices_extra))
-    G = graph_class.from_coo(20, src_indices, dst_indices, name="Dodecahedral Graph")
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def frucht_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        src_indices = cp.array(
-            [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 8, 8, 10],
-            index_dtype,
-        )
-        dst_indices = cp.array(
-            [1, 7, 2, 7, 3, 8, 4, 9, 5, 9, 6, 10, 0, 10, 11, 9, 11, 11],
-            index_dtype,
-        )
-    else:
-        # fmt: off
-        src_indices = cp.array(
-            [
-                0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7,
-                7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11,
-            ],
-            index_dtype,
-        )
-        dst_indices = cp.array(
-            [
-                1, 6, 7, 0, 2, 7, 1, 3, 8, 2, 4, 9, 3, 5, 9, 4, 6, 10, 0, 5, 10, 0,
-                1, 11, 2, 9, 11, 3, 4, 8, 5, 6, 11, 7, 8, 10,
-            ],
-            index_dtype,
-        )
-        # fmt: on
-    G = graph_class.from_coo(12, src_indices, dst_indices, name="Frucht Graph")
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def heawood_graph(create_using=None):
-    # This can also be defined w.r.t. LCF_graph
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7,
-            8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            1, 5, 13, 0, 2, 10, 1, 3, 7, 2, 4, 12, 3, 5, 9, 0, 4, 6, 5, 7, 11, 2, 6,
-            8, 7, 9, 13, 4, 8, 10, 1, 9, 11, 6, 10, 12, 3, 11, 13, 0, 8, 12,
-        ],
-        index_dtype,
-    )
-    # fmt: on
-    if graph_class.is_multigraph():
-        src_indices_extra = cp.array(
-            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
-            index_dtype,
-        )
-        dst_indices_extra = cp.array(
-            [5, 10, 7, 12, 9, 0, 11, 2, 13, 4, 1, 6, 3, 8],
-            index_dtype,
-        )
-        src_indices = cp.hstack((src_indices, src_indices_extra))
-        dst_indices = cp.hstack((dst_indices, dst_indices_extra))
-    G = graph_class.from_coo(14, src_indices, dst_indices, name="Heawood Graph")
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def house_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    src_indices = cp.array([0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4], index_dtype)
-    dst_indices = cp.array([1, 2, 0, 3, 0, 3, 4, 1, 2, 4, 2, 3], index_dtype)
-    G = graph_class.from_coo(5, src_indices, dst_indices, name="House Graph")
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def house_x_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    src_indices = cp.array(
-        [0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4], index_dtype
-    )
-    dst_indices = cp.array(
-        [1, 2, 3, 0, 2, 3, 0, 1, 3, 4, 0, 1, 2, 4, 2, 3], index_dtype
-    )
-    G = graph_class.from_coo(
-        5, src_indices, dst_indices, name="House-with-X-inside Graph"
-    )
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def icosahedral_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4,
-            4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9,
-            9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            1, 5, 7, 8, 11, 0, 2, 5, 6, 8, 1, 3, 6, 8, 9, 2, 4, 6, 9, 10, 3, 5, 6,
-            10, 11, 0, 1, 4, 6, 11, 1, 2, 3, 4, 5, 0, 8, 9, 10, 11, 0, 1, 2, 7, 9, 2,
-            3, 7, 8, 10, 3, 4, 7, 9, 11, 0, 4, 5, 7, 10,
-        ],
-        index_dtype,
-    )
-    # fmt: on
-    G = graph_class.from_coo(
-        12, src_indices, dst_indices, name="Platonic Icosahedral Graph"
-    )
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def krackhardt_kite_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5,
-            5, 6, 6, 6, 6, 6, 7, 7, 7, 8, 8, 9,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            1, 2, 3, 5, 0, 3, 4, 6, 0, 3, 5, 0, 1, 2, 4, 5, 6, 1, 3, 6, 0, 2, 3, 6,
-            7, 1, 3, 4, 5, 7, 5, 6, 8, 7, 9, 8,
-        ],
-        index_dtype,
-    )
-    # fmt: on
-    G = graph_class.from_coo(
-        10, src_indices, dst_indices, name="Krackhardt Kite Social Network"
-    )
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def moebius_kantor_graph(create_using=None):
-    # This can also be defined w.r.t. LCF_graph
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7,
-            8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14,
-            14, 14, 15, 15, 15,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            1, 5, 15, 0, 2, 12, 1, 3, 7, 2, 4, 14, 3, 5, 9, 0, 4, 6, 5, 7, 11, 2, 6,
-            8, 7, 9, 13, 4, 8, 10, 9, 11, 15, 6, 10, 12, 1, 11, 13, 8, 12, 14, 3, 13,
-            15, 0, 10, 14,
-        ],
-        index_dtype,
-    )
-    # fmt: on
-    if graph_class.is_multigraph():
-        src_indices_extra = cp.array(
-            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
-            index_dtype,
-        )
-        dst_indices_extra = cp.array(
-            [5, 12, 7, 14, 9, 0, 11, 2, 13, 4, 15, 6, 1, 8, 3, 10],
-            index_dtype,
-        )
-        src_indices = cp.hstack((src_indices, src_indices_extra))
-        dst_indices = cp.hstack((dst_indices, dst_indices_extra))
-    G = graph_class.from_coo(16, src_indices, dst_indices, name="Moebius-Kantor Graph")
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def octahedral_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    src_indices = cp.array(
-        [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [1, 2, 3, 4, 0, 2, 3, 5, 0, 1, 4, 5, 0, 1, 4, 5, 0, 2, 3, 5, 1, 2, 3, 4],
-        index_dtype,
-    )
-    G = graph_class.from_coo(
-        6, src_indices, dst_indices, name="Platonic Octahedral Graph"
-    )
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def pappus_graph():
-    # This can also be defined w.r.t. LCF_graph
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7,
-            8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14,
-            14, 15, 15, 15, 16, 16, 16, 17, 17, 17,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            1, 5, 17, 0, 2, 8, 1, 3, 13, 2, 4, 10, 3, 5, 15, 0, 4, 6, 5, 7, 11, 6, 8,
-            14, 1, 7, 9, 8, 10, 16, 3, 9, 11, 6, 10, 12, 11, 13, 17, 2, 12, 14, 7,
-            13, 15, 4, 14, 16, 9, 15, 17, 0, 12, 16,
-        ],
-        index_dtype,
-    )
-    # fmt: on
-    use_compat_graph = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs
-    return nxcg.CudaGraph.from_coo(
-        18,
-        src_indices,
-        dst_indices,
-        name="Pappus Graph",
-        use_compat_graph=use_compat_graph,
-    )
-
-
-@networkx_algorithm(version_added="23.12")
-def petersen_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7,
-            8, 8, 8, 9, 9, 9,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            1, 4, 5, 0, 2, 6, 1, 3, 7, 2, 4, 8, 0, 3, 9, 0, 7, 8, 1, 8, 9, 2, 5, 9,
-            3, 5, 6, 4, 6, 7,
-        ],
-        index_dtype,
-    )
-    # fmt: on
-    G = graph_class.from_coo(10, src_indices, dst_indices, name="Petersen Graph")
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def sedgewick_maze_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        src_indices = cp.array([0, 0, 0, 1, 2, 3, 3, 4, 4, 4], index_dtype)
-        dst_indices = cp.array([2, 5, 7, 7, 6, 4, 5, 5, 6, 7], index_dtype)
-    else:
-        src_indices = cp.array(
-            [0, 0, 0, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7],
-            index_dtype,
-        )
-        dst_indices = cp.array(
-            [2, 5, 7, 7, 0, 6, 4, 5, 3, 5, 6, 7, 0, 3, 4, 2, 4, 0, 1, 4],
-            index_dtype,
-        )
-    G = graph_class.from_coo(8, src_indices, dst_indices, name="Sedgewick Maze")
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def tetrahedral_graph(create_using=None):
-    # This can also be defined w.r.t. complete_graph
-    graph_class, inplace = _create_using_class(create_using)
-    src_indices = cp.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], index_dtype)
-    dst_indices = cp.array([1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2], index_dtype)
-    name = (
-        "Platonic Tetrahedral graph"
-        if _IS_NX32_OR_LESS
-        else "Platonic Tetrahedral Graph"
-    )
-    G = graph_class.from_coo(4, src_indices, dst_indices, name=name)
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def truncated_cube_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7,
-            8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14,
-            14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20,
-            20, 21, 21, 21, 22, 22, 22, 23, 23, 23,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            1, 2, 4, 0, 11, 14, 0, 3, 4, 2, 6, 8, 0, 2, 5, 4, 16, 18, 3, 7, 8, 6, 10,
-            12, 3, 6, 9, 8, 17, 20, 7, 11, 12, 1, 10, 14, 7, 10, 13, 12, 21, 22, 1,
-            11, 15, 14, 19, 23, 5, 17, 18, 9, 16, 20, 5, 16, 19, 15, 18, 23, 9, 17,
-            21, 13, 20, 22, 13, 21, 23, 15, 19, 22,
-        ],
-        index_dtype,
-    )
-    # fmt: on
-    G = graph_class.from_coo(24, src_indices, dst_indices, name="Truncated Cube Graph")
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def truncated_tetrahedron_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        src_indices = cp.array(
-            [0, 0, 0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 8, 9, 10], index_dtype
-        )
-        dst_indices = cp.array(
-            [1, 2, 9, 2, 6, 3, 4, 11, 5, 11, 6, 7, 7, 8, 9, 10, 10, 11], index_dtype
-        )
-    else:
-        # fmt: off
-        src_indices = cp.array(
-            [
-                0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7,
-                7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11,
-            ],
-            index_dtype,
-        )
-        dst_indices = cp.array(
-            [
-                1, 2, 9, 0, 2, 6, 0, 1, 3, 2, 4, 11, 3, 5, 11, 4, 6, 7, 1, 5, 7, 5,
-                6, 8, 7, 9, 10, 0, 8, 10, 8, 9, 11, 3, 4, 10,
-            ],
-            index_dtype,
-        )
-        # fmt: on
-    G = graph_class.from_coo(
-        12, src_indices, dst_indices, name="Truncated Tetrahedron Graph"
-    )
-    if inplace:
-        return create_using._become(G)
-    return G
-
-
-@networkx_algorithm(version_added="23.12")
-def tutte_graph(create_using=None):
-    graph_class, inplace = _create_using_class(create_using)
-    if graph_class.is_directed():
-        raise nx.NetworkXError("Directed Graph not supported")
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7,
-            8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14,
-            14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20,
-            20, 21, 21, 21, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26,
-            26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32,
-            32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38,
-            38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, 43, 44, 44,
-            44, 45, 45, 45,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            1, 2, 3, 0, 4, 26, 0, 10, 11, 0, 18, 19, 1, 5, 33, 4, 6, 29, 5, 7, 27, 6,
-            8, 14, 7, 9, 38, 8, 10, 37, 2, 9, 39, 2, 12, 39, 11, 13, 35, 12, 14, 15,
-            7, 13, 34, 13, 16, 22, 15, 17, 44, 16, 18, 43, 3, 17, 45, 3, 20, 45, 19,
-            21, 41, 20, 22, 23, 15, 21, 40, 21, 24, 27, 23, 25, 32, 24, 26, 31, 1,
-            25, 33, 6, 23, 28, 27, 29, 32, 5, 28, 30, 29, 31, 33, 25, 30, 32, 24, 28,
-            31, 4, 26, 30, 14, 35, 38, 12, 34, 36, 35, 37, 39, 9, 36, 38, 8, 34, 37,
-            10, 11, 36, 22, 41, 44, 20, 40, 42, 41, 43, 45, 17, 42, 44, 16, 40, 43,
-            18, 19, 42,
-        ],
-        index_dtype,
-    )
-    # fmt: on
-    G = graph_class.from_coo(46, src_indices, dst_indices, name="Tutte's Graph")
-    if inplace:
-        return create_using._become(G)
-    return G
diff --git a/python/nx-cugraph/nx_cugraph/generators/social.py b/python/nx-cugraph/nx_cugraph/generators/social.py
deleted file mode 100644
index 09d405e7561..00000000000
--- a/python/nx-cugraph/nx_cugraph/generators/social.py
+++ /dev/null
@@ -1,313 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-import numpy as np
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-
-from ..utils import index_dtype, networkx_algorithm
-
-__all__ = [
-    "davis_southern_women_graph",
-    "florentine_families_graph",
-    "karate_club_graph",
-    "les_miserables_graph",
-]
-
-
-@networkx_algorithm(version_added="23.12")
-def davis_southern_women_graph():
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3,
-            3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 8,
-            8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12,
-            12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15,
-            16, 16, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 21, 21,
-            21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
-            24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25,
-            25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
-            27, 27, 27, 27, 27, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 30, 30, 30,
-            31, 31, 31,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            18, 19, 20, 21, 22, 23, 25, 26, 18, 19, 20, 22, 23, 24, 25, 19, 20, 21,
-            22, 23, 24, 25, 26, 18, 20, 21, 22, 23, 24, 25, 20, 21, 22, 24, 20, 22,
-            23, 25, 22, 23, 24, 25, 23, 25, 26, 22, 24, 25, 26, 24, 25, 26, 29, 25,
-            26, 27, 29, 25, 26, 27, 29, 30, 31, 24, 25, 26, 27, 29, 30, 31, 23, 24,
-            26, 27, 28, 29, 30, 31, 24, 25, 27, 28, 29, 25, 26, 26, 28, 26, 28, 0, 1,
-            3, 0, 1, 2, 0, 1, 2, 3, 4, 5, 0, 2, 3, 4, 0, 1, 2, 3, 4, 5, 6, 8, 0, 1,
-            2, 3, 5, 6, 7, 13, 1, 2, 3, 4, 6, 8, 9, 12, 13, 14, 0, 1, 2, 3, 5, 6, 7,
-            8, 9, 10, 11, 12, 14, 15, 0, 2, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 10,
-            11, 12, 13, 14, 13, 14, 16, 17, 9, 10, 11, 12, 13, 14, 11, 12, 13, 11,
-            12, 13,
-        ],
-        index_dtype,
-    )
-    bipartite = cp.array(
-        [
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
-            1, 1, 1, 1, 1, 1, 1, 1,
-        ],
-        np.int8,
-    )
-    women = [
-        "Evelyn Jefferson", "Laura Mandeville", "Theresa Anderson", "Brenda Rogers",
-        "Charlotte McDowd", "Frances Anderson", "Eleanor Nye", "Pearl Oglethorpe",
-        "Ruth DeSand", "Verne Sanderson", "Myra Liddel", "Katherina Rogers",
-        "Sylvia Avondale", "Nora Fayette", "Helen Lloyd", "Dorothy Murchison",
-        "Olivia Carleton", "Flora Price",
-    ]
-    events = [
-        "E1", "E2", "E3", "E4", "E5", "E6", "E7", "E8", "E9", "E10", "E11", "E12",
-        "E13", "E14",
-    ]
-    # fmt: on
-    use_compat_graph = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs
-    return nxcg.CudaGraph.from_coo(
-        32,
-        src_indices,
-        dst_indices,
-        node_values={"bipartite": bipartite},
-        id_to_key=women + events,
-        top=women,
-        bottom=events,
-        use_compat_graph=use_compat_graph,
-    )
-
-
-@networkx_algorithm(version_added="23.12")
-def florentine_families_graph():
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 6, 6, 6, 7, 8, 8, 8, 8, 8, 8,
-            9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 14,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            8, 5, 6, 8, 4, 8, 6, 10, 13, 2, 10, 13, 1, 1, 3, 7, 14, 6, 0, 1, 2, 11,
-            12, 14, 12, 3, 4, 13, 8, 13, 14, 8, 9, 3, 4, 10, 11, 6, 8, 11,
-        ],
-        index_dtype,
-    )
-    nodes = [
-        "Acciaiuoli", "Albizzi", "Barbadori", "Bischeri", "Castellani", "Ginori",
-        "Guadagni", "Lamberteschi", "Medici", "Pazzi", "Peruzzi", "Ridolfi",
-        "Salviati", "Strozzi", "Tornabuoni"
-    ]
-    # fmt: on
-    use_compat_graph = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs
-    return nxcg.CudaGraph.from_coo(
-        15,
-        src_indices,
-        dst_indices,
-        id_to_key=nodes,
-        use_compat_graph=use_compat_graph,
-    )
-
-
-@networkx_algorithm(version_added="23.12")
-def karate_club_graph():
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
-            1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5,
-            6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 10, 10, 10, 11, 12, 12, 13,
-            13, 13, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 19, 20,
-            20, 21, 21, 22, 22, 23, 23, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26,
-            27, 27, 27, 27, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31,
-            31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
-            33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13,
-            17, 19, 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0,
-            6, 10, 0, 6, 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33,
-            0, 4, 5, 0, 0, 3, 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0,
-            1, 33, 32, 33, 0, 1, 32, 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31,
-            29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, 32, 33, 1, 8, 32, 33, 0, 24,
-            25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 33, 8, 9, 13,
-            14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32,
-        ],
-        index_dtype,
-    )
-    weights = cp.array(
-        [
-            4, 5, 3, 3, 3, 3, 2, 2, 2, 3, 1, 3, 2, 2, 2, 2, 4, 6, 3, 4, 5, 1, 2, 2,
-            2, 5, 6, 3, 4, 5, 1, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 5, 3, 3,
-            3, 2, 5, 3, 2, 4, 4, 3, 2, 5, 3, 3, 4, 1, 2, 2, 3, 3, 3, 1, 3, 3, 5, 3,
-            3, 3, 3, 2, 3, 4, 3, 3, 2, 1, 1, 2, 2, 2, 1, 3, 1, 2, 2, 2, 3, 5, 4, 3,
-            5, 4, 2, 3, 2, 5, 2, 7, 4, 2, 2, 4, 3, 4, 2, 2, 2, 3, 4, 4, 2, 2, 3, 3,
-            3, 2, 2, 7, 2, 4, 4, 2, 3, 3, 3, 1, 3, 2, 5, 4, 3, 4, 5, 4, 2, 3, 2, 4,
-            2, 1, 1, 3, 4, 2, 4, 2, 2, 3, 4, 5,
-        ],
-        np.int8,
-    )
-    # For now, cupy doesn't handle str dtypes and we primarily handle cupy arrays.
-    # We try to support numpy arrays for node values, so let's use numpy here.
-    clubs = np.array([
-        "Mr. Hi", "Mr. Hi", "Mr. Hi", "Mr. Hi", "Mr. Hi", "Mr. Hi", "Mr. Hi",
-        "Mr. Hi", "Mr. Hi", "Officer", "Mr. Hi", "Mr. Hi", "Mr. Hi", "Mr. Hi",
-        "Officer", "Officer", "Mr. Hi", "Mr. Hi", "Officer", "Mr. Hi", "Officer",
-        "Mr. Hi", "Officer", "Officer", "Officer", "Officer", "Officer", "Officer",
-        "Officer", "Officer", "Officer", "Officer", "Officer", "Officer",
-    ])
-    # fmt: on
-    use_compat_graph = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs
-    return nxcg.CudaGraph.from_coo(
-        34,
-        src_indices,
-        dst_indices,
-        edge_values={"weight": weights},
-        node_values={"club": clubs},
-        name="Zachary's Karate Club",
-        use_compat_graph=use_compat_graph,
-    )
-
-
-@networkx_algorithm(version_added="23.12")
-def les_miserables_graph():
-    # fmt: off
-    src_indices = cp.array(
-        [
-            0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-            2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6,
-            6, 6, 6, 6, 6, 6, 6, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10, 10,
-            10, 10, 10, 10, 11, 12, 12, 12, 12, 12, 12, 13, 13, 14, 14, 15, 15, 15,
-            15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17,
-            17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19,
-            20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 23, 23, 23,
-            23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
-            24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26,
-            26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28,
-            28, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 30,
-            30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-            31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35,
-            35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
-            38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
-            40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 41, 42, 42, 42, 42, 42,
-            42, 43, 44, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 46,
-            46, 46, 46, 46, 46, 46, 47, 47, 48, 48, 49, 49, 49, 49, 49, 49, 49, 49,
-            49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 50, 50, 50, 51, 51, 51, 51,
-            51, 51, 51, 52, 53, 53, 54, 55, 55, 55, 55, 55, 55, 55, 56, 56, 56, 57,
-            57, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59, 59,
-            59, 59, 59, 60, 60, 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 64,
-            65, 65, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 68, 69, 69, 69,
-            69, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 71,
-            71, 71, 71, 71, 71, 71, 71, 71, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73,
-            73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
-            73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 74, 74, 75, 75, 75, 76, 76,
-            76, 76, 76, 76, 76,
-        ],
-        index_dtype,
-    )
-    dst_indices = cp.array(
-        [
-            25, 58, 70, 9, 15, 25, 31, 37, 39, 58, 59, 70, 73, 6, 17, 21, 24, 30, 31,
-            35, 40, 46, 49, 55, 67, 8, 10, 12, 16, 27, 39, 42, 73, 34, 49, 23, 26,
-            27, 29, 44, 71, 76, 2, 17, 21, 24, 30, 31, 35, 40, 46, 49, 55, 67, 73,
-            70, 3, 10, 12, 16, 42, 73, 1, 15, 25, 31, 37, 59, 70, 3, 8, 12, 16, 42,
-            73, 62, 3, 8, 10, 16, 42, 73, 14, 31, 13, 31, 1, 9, 24, 25, 37, 39, 58,
-            59, 70, 73, 3, 8, 10, 12, 42, 73, 2, 6, 21, 24, 30, 31, 35, 40, 46, 49,
-            67, 34, 39, 45, 49, 51, 58, 70, 71, 72, 73, 75, 62, 62, 2, 6, 17, 24, 25,
-            30, 31, 35, 40, 46, 49, 55, 67, 62, 5, 26, 27, 29, 44, 71, 76, 2, 6, 15,
-            17, 21, 30, 31, 35, 39, 40, 46, 49, 55, 67, 73, 0, 1, 9, 15, 21, 37, 46,
-            49, 58, 59, 70, 5, 23, 27, 29, 44, 71, 76, 3, 5, 23, 26, 29, 39, 44, 48,
-            58, 65, 69, 70, 71, 73, 76, 36, 39, 60, 73, 5, 23, 26, 27, 44, 71, 76, 2,
-            6, 17, 21, 24, 31, 35, 40, 46, 49, 67, 1, 2, 6, 9, 13, 14, 17, 21, 24,
-            30, 35, 37, 39, 40, 46, 49, 53, 55, 59, 67, 70, 73, 62, 73, 4, 18, 45,
-            47, 49, 51, 73, 2, 6, 17, 21, 24, 30, 31, 40, 55, 67, 28, 1, 9, 15, 25,
-            31, 39, 58, 59, 70, 73, 73, 1, 3, 15, 18, 24, 27, 28, 31, 37, 58, 59, 69,
-            70, 72, 73, 74, 75, 2, 6, 17, 21, 24, 30, 31, 35, 46, 49, 55, 67, 53, 3,
-            8, 10, 12, 16, 73, 73, 5, 23, 26, 27, 29, 71, 76, 18, 34, 49, 51, 2, 6,
-            17, 21, 24, 25, 30, 31, 40, 49, 61, 34, 58, 27, 73, 2, 4, 6, 17, 18, 21,
-            24, 25, 30, 31, 34, 40, 45, 46, 51, 66, 70, 71, 73, 56, 62, 73, 18, 34,
-            45, 49, 52, 57, 73, 51, 31, 41, 73, 2, 6, 21, 24, 31, 35, 40, 50, 62, 73,
-            51, 66, 0, 1, 15, 18, 25, 27, 37, 39, 47, 70, 73, 1, 9, 15, 25, 31, 37,
-            39, 70, 73, 28, 73, 46, 11, 19, 20, 22, 32, 50, 56, 63, 64, 73, 62, 62,
-            27, 69, 49, 57, 70, 2, 6, 17, 21, 24, 30, 31, 35, 40, 73, 27, 39, 65, 73,
-            0, 1, 7, 9, 15, 18, 25, 27, 31, 37, 39, 49, 58, 59, 66, 73, 5, 18, 23,
-            26, 27, 29, 44, 49, 76, 18, 39, 73, 1, 3, 6, 8, 10, 12, 15, 16, 18, 24,
-            27, 28, 31, 33, 34, 37, 38, 39, 42, 43, 48, 49, 50, 51, 54, 56, 58, 59,
-            60, 62, 68, 69, 70, 72, 74, 75, 39, 73, 18, 39, 73, 5, 23, 26, 27, 29,
-            44, 71,
-        ],
-        index_dtype,
-    )
-    weights = cp.array(
-        [
-            2, 1, 2, 3, 4, 1, 1, 6, 2, 1, 2, 6, 1, 4, 5, 6, 4, 3, 5, 1, 5, 2, 1, 1,
-            2, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 3, 4, 3, 4, 4, 4, 3, 4, 9, 12, 10, 6, 5,
-            3, 7, 1, 5, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 3, 1, 1, 1, 3, 1, 3, 2, 2, 2,
-            2, 3, 3, 1, 1, 2, 2, 2, 2, 2, 3, 2, 3, 2, 4, 1, 1, 1, 4, 1, 1, 2, 4, 1,
-            1, 2, 2, 2, 2, 2, 5, 9, 13, 15, 5, 6, 1, 5, 2, 5, 2, 3, 1, 1, 21, 2, 4,
-            1, 1, 2, 31, 1, 2, 1, 6, 12, 13, 17, 1, 6, 7, 2, 5, 2, 9, 1, 3, 1, 3, 3,
-            4, 5, 3, 3, 4, 4, 10, 1, 15, 17, 6, 7, 3, 6, 5, 1, 7, 1, 4, 4, 2, 1, 1,
-            1, 1, 1, 1, 5, 2, 1, 3, 4, 3, 3, 3, 4, 4, 3, 1, 3, 4, 3, 4, 5, 3, 2, 2,
-            1, 2, 1, 3, 9, 4, 2, 1, 3, 8, 4, 5, 3, 4, 3, 3, 4, 3, 6, 5, 6, 6, 2, 1,
-            5, 1, 1, 2, 1, 5, 5, 1, 2, 2, 6, 7, 7, 2, 1, 1, 1, 3, 1, 4, 2, 1, 1, 1,
-            1, 1, 1, 1, 1, 3, 1, 1, 12, 9, 2, 1, 3, 1, 2, 3, 1, 1, 2, 1, 1, 2, 6, 3,
-            4, 1, 1, 1, 1, 2, 5, 1, 1, 2, 1, 1, 1, 6, 5, 1, 1, 1, 1, 1, 1, 5, 1, 17,
-            1, 1, 5, 7, 5, 5, 5, 5, 3, 2, 1, 2, 1, 2, 1, 2, 2, 3, 2, 2, 3, 1, 4, 3,
-            4, 3, 3, 4, 3, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1,
-            1, 1, 5, 5, 21, 9, 7, 5, 1, 4, 12, 2, 1, 1, 6, 1, 2, 1, 19, 6, 8, 3, 2,
-            9, 2, 6, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 10, 3, 1, 1, 1, 1,
-            1, 4, 2, 2, 1, 1, 1, 13, 7, 2, 1, 2, 1, 1, 2, 1, 1, 1, 3, 1, 3, 1, 2, 1,
-            1, 1, 8, 10, 1, 1, 5, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 3, 4, 2, 1, 1, 2, 1,
-            2, 1, 2, 3, 2, 6, 1, 3, 4, 1, 3, 1, 1, 5, 5, 2, 13, 1, 1, 12, 4, 1, 3, 4,
-            3, 3, 4, 1, 3, 2, 1, 1, 1, 2, 1, 2, 3, 2, 1, 2, 31, 4, 9, 8, 1, 1, 2, 1,
-            1, 17, 3, 1, 1, 19, 3, 2, 1, 3, 7, 1, 1, 5, 1, 3, 12, 1, 2, 3, 1, 2, 1,
-            1, 3, 3, 4, 3, 4, 4, 3, 3,
-        ],
-        np.int8,
-    )
-    nodes = [
-        "Anzelma", "Babet", "Bahorel", "Bamatabois", "BaronessT", "Blacheville",
-        "Bossuet", "Boulatruelle", "Brevet", "Brujon", "Champmathieu",
-        "Champtercier", "Chenildieu", "Child1", "Child2", "Claquesous",
-        "Cochepaille", "Combeferre", "Cosette", "Count", "CountessDeLo",
-        "Courfeyrac", "Cravatte", "Dahlia", "Enjolras", "Eponine", "Fameuil",
-        "Fantine", "Fauchelevent", "Favourite", "Feuilly", "Gavroche", "Geborand",
-        "Gervais", "Gillenormand", "Grantaire", "Gribier", "Gueulemer", "Isabeau",
-        "Javert", "Joly", "Jondrette", "Judge", "Labarre", "Listolier",
-        "LtGillenormand", "Mabeuf", "Magnon", "Marguerite", "Marius",
-        "MlleBaptistine", "MlleGillenormand", "MlleVaubois", "MmeBurgon", "MmeDeR",
-        "MmeHucheloup", "MmeMagloire", "MmePontmercy", "MmeThenardier",
-        "Montparnasse", "MotherInnocent", "MotherPlutarch", "Myriel", "Napoleon",
-        "OldMan", "Perpetue", "Pontmercy", "Prouvaire", "Scaufflaire", "Simplice",
-        "Thenardier", "Tholomyes", "Toussaint", "Valjean", "Woman1", "Woman2",
-        "Zephine",
-    ]
-    # fmt: on
-    use_compat_graph = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs
-    return nxcg.CudaGraph.from_coo(
-        77,
-        src_indices,
-        dst_indices,
-        edge_values={"weight": weights},
-        id_to_key=nodes,
-        use_compat_graph=use_compat_graph,
-    )
diff --git a/python/nx-cugraph/nx_cugraph/interface.py b/python/nx-cugraph/nx_cugraph/interface.py
deleted file mode 100644
index 1a3d08409a2..00000000000
--- a/python/nx-cugraph/nx_cugraph/interface.py
+++ /dev/null
@@ -1,486 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-import os
-import sys
-
-import networkx as nx
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-
-
-class BackendInterface:
-    # Required conversions
-    @staticmethod
-    def convert_from_nx(graph, *args, edge_attrs=None, weight=None, **kwargs):
-        if weight is not None:
-            # MAINT: networkx 3.0, 3.1
-            # For networkx 3.0 and 3.1 compatibility
-            if edge_attrs is not None:
-                raise TypeError(
-                    "edge_attrs and weight arguments should not both be given"
-                )
-            edge_attrs = {weight: 1}
-        return nxcg.from_networkx(
-            graph,
-            *args,
-            edge_attrs=edge_attrs,
-            use_compat_graph=_nxver < (3, 3)
-            or nx.config.backends.cugraph.use_compat_graphs,
-            **kwargs,
-        )
-
-    @staticmethod
-    def convert_to_nx(obj, *, name: str | None = None):
-        if isinstance(obj, nxcg.CudaGraph):
-            # Observe that this does not try to convert Graph!
-            return nxcg.to_networkx(obj)
-        return obj
-
-    @staticmethod
-    def on_start_tests(items):
-        """Modify pytest items after tests have been collected.
-
-        This is called during ``pytest_collection_modifyitems`` phase of pytest.
-        We use this to set `xfail` on tests we expect to fail. See:
-
-        https://docs.pytest.org/en/stable/reference/reference.html#std-hook-pytest_collection_modifyitems
-        """
-        try:
-            import pytest
-        except ModuleNotFoundError:
-            return
-
-        def key(testpath):
-            filename, path = testpath.split(":")
-            *names, testname = path.split(".")
-            if names:
-                [classname] = names
-                return (testname, frozenset({classname, filename}))
-            return (testname, frozenset({filename}))
-
-        use_compat_graph = (
-            _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs
-        )
-        fallback = use_compat_graph or nx.utils.backends._dispatchable._fallback_to_nx
-
-        # Reasons for xfailing
-        # For nx version <= 3.1
-        no_weights = "weighted implementation not currently supported"
-        no_multigraph = "multigraphs not currently supported"
-        # For nx version <= 3.2
-        nx_cugraph_in_test_setup = (
-            "nx-cugraph Graph is incompatible in test setup in nx versions < 3.3"
-        )
-        # For all versions
-        louvain_different = "Louvain may be different due to RNG"
-        sssp_path_different = "sssp may choose a different valid path"
-        tuple_elements_preferred = "elements are tuples instead of lists"
-        no_mixed_dtypes_for_nodes = (
-            # This one is tricky b/c we don't raise; all dtypes are treated as str
-            "mixed dtypes (str, int, float) for single node property not supported"
-        )
-        # These shouldn't fail if using Graph or falling back to networkx
-        no_string_dtype = "string edge values not currently supported"
-        no_object_dtype_for_edges = (
-            "Edges don't support object dtype (lists, strings, etc.)"
-        )
-
-        xfail = {
-            # This is removed while strongly_connected_components() is not
-            # dispatchable. See algorithms/components/strongly_connected.py for
-            # details.
-            #
-            # key(
-            #     "test_strongly_connected.py:"
-            #     "TestStronglyConnected.test_condensation_mapping_and_members"
-            # ): "Strongly connected groups in different iteration order",
-            key(
-                "test_cycles.py:TestMinimumCycleBasis.test_unweighted_diamond"
-            ): sssp_path_different,
-            key(
-                "test_cycles.py:TestMinimumCycleBasis.test_weighted_diamond"
-            ): sssp_path_different,
-            key(
-                "test_cycles.py:TestMinimumCycleBasis.test_petersen_graph"
-            ): sssp_path_different,
-            key(
-                "test_cycles.py:TestMinimumCycleBasis."
-                "test_gh6787_and_edge_attribute_names"
-            ): sssp_path_different,
-            key(
-                "test_relabel.py:"
-                "test_relabel_preserve_node_order_partial_mapping_with_copy_false"
-            ): "Node order is preserved when relabeling with partial mapping",
-            key(
-                "test_gml.py:"
-                "TestPropertyLists.test_reading_graph_with_single_element_list_property"
-            ): tuple_elements_preferred,
-        }
-        if not fallback:
-            xfail.update(
-                {
-                    key(
-                        "test_graph_hashing.py:test_isomorphic_edge_attr"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_graph_hashing.py:test_isomorphic_edge_attr_and_node_attr"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_graph_hashing.py:test_isomorphic_edge_attr_subgraph_hash"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_graph_hashing.py:"
-                        "test_isomorphic_edge_attr_and_node_attr_subgraph_hash"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_summarization.py:TestSNAPNoEdgeTypes.test_summary_graph"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_summarization.py:TestSNAPUndirected.test_summary_graph"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_summarization.py:TestSNAPDirected.test_summary_graph"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_gexf.py:TestGEXF.test_relabel"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_gml.py:TestGraph.test_parse_gml_cytoscape_bug"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_gml.py:TestGraph.test_parse_gml"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_gml.py:TestGraph.test_read_gml"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_gml.py:TestGraph.test_data_types"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_gml.py:"
-                        "TestPropertyLists.test_reading_graph_with_list_property"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_relabel.py:"
-                        "TestRelabel.test_relabel_multidigraph_inout_merge_nodes"
-                    ): no_string_dtype,
-                    key(
-                        "test_relabel.py:"
-                        "TestRelabel.test_relabel_multigraph_merge_inplace"
-                    ): no_string_dtype,
-                    key(
-                        "test_relabel.py:"
-                        "TestRelabel.test_relabel_multidigraph_merge_inplace"
-                    ): no_string_dtype,
-                    key(
-                        "test_relabel.py:"
-                        "TestRelabel.test_relabel_multidigraph_inout_copy"
-                    ): no_string_dtype,
-                    key(
-                        "test_relabel.py:TestRelabel.test_relabel_multigraph_merge_copy"
-                    ): no_string_dtype,
-                    key(
-                        "test_relabel.py:"
-                        "TestRelabel.test_relabel_multidigraph_merge_copy"
-                    ): no_string_dtype,
-                    key(
-                        "test_relabel.py:"
-                        "TestRelabel.test_relabel_multigraph_nonnumeric_key"
-                    ): no_string_dtype,
-                    key(
-                        "test_contraction.py:test_multigraph_path"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_contraction.py:test_directed_multigraph_path"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_contraction.py:test_multigraph_blockmodel"
-                    ): no_object_dtype_for_edges,
-                    key(
-                        "test_summarization.py:"
-                        "TestSNAPUndirectedMulti.test_summary_graph"
-                    ): no_string_dtype,
-                    key(
-                        "test_summarization.py:TestSNAPDirectedMulti.test_summary_graph"
-                    ): no_string_dtype,
-                }
-            )
-        else:
-            xfail.update(
-                {
-                    key(
-                        "test_gml.py:"
-                        "TestPropertyLists.test_reading_graph_with_list_property"
-                    ): no_mixed_dtypes_for_nodes,
-                }
-            )
-
-        if _nxver <= (3, 2):
-            xfail.update(
-                {
-                    # NetworkX versions prior to 3.2.1 have tests written to
-                    # expect sp.sparse.linalg.ArpackNoConvergence exceptions
-                    # raised on no convergence in HITS. Newer versions since
-                    # the merge of
-                    # https://github.com/networkx/networkx/pull/7084 expect
-                    # nx.PowerIterationFailedConvergence, which is what
-                    # nx_cugraph.hits raises, so we mark them as xfail for
-                    # previous versions of NX.
-                    key(
-                        "test_hits.py:TestHITS.test_hits_not_convergent"
-                    ): "nx_cugraph.hits raises updated exceptions not caught in "
-                    "these tests",
-                    # NetworkX versions 3.2 and older contain tests that fail
-                    # with pytest>=8. Assume pytest>=8 and mark xfail.
-                    key(
-                        "test_strongly_connected.py:"
-                        "TestStronglyConnected.test_connected_raise"
-                    ): "test is incompatible with pytest>=8",
-                    # NetworkX 3.3 introduced logic around functions that return graphs
-                    key(
-                        "test_vf2pp_helpers.py:TestGraphTinoutUpdating.test_updating"
-                    ): nx_cugraph_in_test_setup,
-                    key(
-                        "test_vf2pp_helpers.py:TestGraphTinoutUpdating.test_restoring"
-                    ): nx_cugraph_in_test_setup,
-                    key(
-                        "test_vf2pp_helpers.py:TestDiGraphTinoutUpdating.test_updating"
-                    ): nx_cugraph_in_test_setup,
-                    key(
-                        "test_vf2pp_helpers.py:TestDiGraphTinoutUpdating.test_restoring"
-                    ): nx_cugraph_in_test_setup,
-                }
-            )
-
-        if _nxver <= (3, 1):
-            # MAINT: networkx 3.0, 3.1
-            # NetworkX 3.2 added the ability to "fallback to nx" if backend algorithms
-            # raise NotImplementedError or `can_run` returns False. The tests below
-            # exercise behavior we have not implemented yet, so we mark them as xfail
-            # for previous versions of NX.
-            xfail.update(
-                {
-                    key(
-                        "test_agraph.py:TestAGraph.test_no_warnings_raised"
-                    ): "pytest.warn(None) deprecated",
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedBetweennessCentrality.test_K5"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedBetweennessCentrality.test_P3_normalized"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedBetweennessCentrality.test_P3"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedBetweennessCentrality.test_krackhardt_kite_graph"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedBetweennessCentrality."
-                        "test_krackhardt_kite_graph_normalized"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedBetweennessCentrality."
-                        "test_florentine_families_graph"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedBetweennessCentrality.test_les_miserables_graph"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedBetweennessCentrality.test_ladder_graph"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedBetweennessCentrality.test_G"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedBetweennessCentrality.test_G2"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedBetweennessCentrality.test_G3"
-                    ): no_multigraph,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedBetweennessCentrality.test_G4"
-                    ): no_multigraph,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedEdgeBetweennessCentrality.test_K5"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedEdgeBetweennessCentrality.test_C4"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedEdgeBetweennessCentrality.test_P4"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedEdgeBetweennessCentrality.test_balanced_tree"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedEdgeBetweennessCentrality.test_weighted_graph"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedEdgeBetweennessCentrality."
-                        "test_normalized_weighted_graph"
-                    ): no_weights,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedEdgeBetweennessCentrality.test_weighted_multigraph"
-                    ): no_multigraph,
-                    key(
-                        "test_betweenness_centrality.py:"
-                        "TestWeightedEdgeBetweennessCentrality."
-                        "test_normalized_weighted_multigraph"
-                    ): no_multigraph,
-                }
-            )
-        else:
-            xfail.update(
-                {
-                    key(
-                        "test_louvain.py:test_karate_club_partition"
-                    ): louvain_different,
-                    key("test_louvain.py:test_none_weight_param"): louvain_different,
-                    key("test_louvain.py:test_multigraph"): louvain_different,
-                    # See networkx#6630
-                    key(
-                        "test_louvain.py:test_undirected_selfloops"
-                    ): "self-loops not handled in Louvain",
-                }
-            )
-            if sys.version_info[:2] == (3, 9):
-                # This test is sensitive to RNG, which depends on Python version
-                xfail[key("test_louvain.py:test_threshold")] = (
-                    "Louvain does not support seed parameter"
-                )
-            if _nxver >= (3, 2):
-                if not fallback:
-                    xfail.update(
-                        {
-                            key(
-                                "test_convert_pandas.py:TestConvertPandas."
-                                "test_from_edgelist_multi_attr_incl_target"
-                            ): no_string_dtype,
-                            key(
-                                "test_convert_pandas.py:TestConvertPandas."
-                                "test_from_edgelist_multidigraph_and_edge_attr"
-                            ): no_string_dtype,
-                            key(
-                                "test_convert_pandas.py:TestConvertPandas."
-                                "test_from_edgelist_int_attr_name"
-                            ): no_string_dtype,
-                        }
-                    )
-                if _nxver[1] == 2:
-                    different_iteration_order = "Different graph data iteration order"
-                    xfail.update(
-                        {
-                            key(
-                                "test_cycles.py:TestMinimumCycleBasis."
-                                "test_gh6787_and_edge_attribute_names"
-                            ): different_iteration_order,
-                            key(
-                                "test_euler.py:TestEulerianCircuit."
-                                "test_eulerian_circuit_cycle"
-                            ): different_iteration_order,
-                            key(
-                                "test_gml.py:TestGraph.test_special_float_label"
-                            ): different_iteration_order,
-                        }
-                    )
-                elif _nxver[1] >= 3:
-                    xfail.update(
-                        {
-                            key("test_louvain.py:test_max_level"): louvain_different,
-                        }
-                    )
-
-        too_slow = "Too slow to run"
-        skip = {
-            key("test_tree_isomorphism.py:test_positive"): too_slow,
-            key("test_tree_isomorphism.py:test_negative"): too_slow,
-            # These repeatedly call `bfs_layers`, which converts the graph every call
-            key(
-                "test_vf2pp.py:TestGraphISOVF2pp.test_custom_graph2_different_labels"
-            ): too_slow,
-            key(
-                "test_vf2pp.py:TestGraphISOVF2pp.test_custom_graph3_same_labels"
-            ): too_slow,
-            key(
-                "test_vf2pp.py:TestGraphISOVF2pp.test_custom_graph3_different_labels"
-            ): too_slow,
-            key(
-                "test_vf2pp.py:TestGraphISOVF2pp.test_custom_graph4_same_labels"
-            ): too_slow,
-            key(
-                "test_vf2pp.py:TestGraphISOVF2pp."
-                "test_disconnected_graph_all_same_labels"
-            ): too_slow,
-            key(
-                "test_vf2pp.py:TestGraphISOVF2pp."
-                "test_disconnected_graph_all_different_labels"
-            ): too_slow,
-            key(
-                "test_vf2pp.py:TestGraphISOVF2pp."
-                "test_disconnected_graph_some_same_labels"
-            ): too_slow,
-            key(
-                "test_vf2pp.py:TestMultiGraphISOVF2pp."
-                "test_custom_multigraph3_same_labels"
-            ): too_slow,
-            key(
-                "test_vf2pp_helpers.py:TestNodeOrdering."
-                "test_matching_order_all_branches"
-            ): too_slow,
-        }
-        if os.environ.get("PYTEST_NO_SKIP", False):
-            skip.clear()
-
-        for item in items:
-            kset = set(item.keywords)
-            for (test_name, keywords), reason in xfail.items():
-                if item.name == test_name and keywords.issubset(kset):
-                    item.add_marker(pytest.mark.xfail(reason=reason))
-            for (test_name, keywords), reason in skip.items():
-                if item.name == test_name and keywords.issubset(kset):
-                    item.add_marker(pytest.mark.skip(reason=reason))
-
-    @classmethod
-    def can_run(cls, name, args, kwargs):
-        """Can this backend run the specified algorithms with the given arguments?"""
-        # TODO: drop hasattr when networkx 3.0 support is dropped
-        return hasattr(cls, name) and getattr(cls, name).can_run(*args, **kwargs)
-
-    @classmethod
-    def should_run(cls, name, args, kwargs):
-        """Should this backend run the specified algorithms with the given arguments?"""
-        # TODO: drop hasattr when networkx 3.0 support is dropped
-        return hasattr(cls, name) and getattr(cls, name).should_run(*args, **kwargs)
diff --git a/python/nx-cugraph/nx_cugraph/relabel.py b/python/nx-cugraph/nx_cugraph/relabel.py
deleted file mode 100644
index e38e18c779e..00000000000
--- a/python/nx-cugraph/nx_cugraph/relabel.py
+++ /dev/null
@@ -1,293 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import itertools
-from collections import defaultdict
-
-import cupy as cp
-import networkx as nx
-import numpy as np
-
-import nx_cugraph as nxcg
-
-from .utils import _get_int_dtype, _groupby, index_dtype, networkx_algorithm
-
-__all__ = [
-    "convert_node_labels_to_integers",
-    "relabel_nodes",
-]
-
-
-@networkx_algorithm(version_added="24.08")
-def relabel_nodes(G, mapping, copy=True):
-    G_orig = G
-    if isinstance(G, nx.Graph):
-        is_compat_graph = isinstance(G, nxcg.Graph)
-        if not copy and not is_compat_graph:
-            raise RuntimeError(
-                "Using `copy=False` is invalid when using a NetworkX graph "
-                "as input to `nx_cugraph.relabel_nodes`"
-            )
-        G = nxcg.from_networkx(G, preserve_all_attrs=True)
-    else:
-        is_compat_graph = False
-
-    it = range(G._N) if G.key_to_id is None else G.id_to_key
-    if callable(mapping):
-        previd_to_key = [mapping(node) for node in it]
-    else:
-        previd_to_key = [mapping.get(node, node) for node in it]
-    if not copy:
-        # Our implementation does not need to raise here, but do so to match networkx.
-        it = range(G._N) if G.key_to_id is None else G.id_to_key
-        D = nx.DiGraph([(x, y) for x, y in zip(it, previd_to_key) if x != y])
-        if nx.algorithms.dag.has_cycle(D):
-            raise nx.NetworkXUnfeasible(
-                "The node label sets are overlapping and no ordering can "
-                "resolve the mapping. Use copy=True."
-            )
-    key_to_previd = {val: i for i, val in enumerate(previd_to_key)}
-    newid_to_key = list(key_to_previd)
-    key_to_newid = dict(zip(newid_to_key, range(len(newid_to_key))))
-
-    src_indices = G.src_indices
-    dst_indices = G.dst_indices
-    edge_values = G.edge_values
-    edge_masks = G.edge_masks
-    node_values = G.node_values
-    node_masks = G.node_masks
-    if G.is_multigraph():
-        edge_indices = G.edge_indices
-        edge_keys = G.edge_keys
-    if len(key_to_previd) != G._N:
-        # Some nodes were combined.
-        # Node data doesn't get merged, so use the data from the last shared index
-        int_dtype = _get_int_dtype(G._N - 1)
-        node_indices = cp.fromiter(key_to_previd.values(), int_dtype)
-        node_indices_np = node_indices.get()  # Node data may be cupy or numpy arrays
-        node_values = {key: val[node_indices_np] for key, val in node_values.items()}
-        node_masks = {key: val[node_indices_np] for key, val in node_masks.items()}
-
-        # Renumber, but will have duplicates
-        translations = cp.fromiter(
-            (key_to_newid[key] for key in previd_to_key), index_dtype
-        )
-        src_indices_dup = translations[src_indices]
-        dst_indices_dup = translations[dst_indices]
-
-        if G.is_multigraph():
-            # No merging necessary for multigraphs.
-            if G.is_directed():
-                src_indices = src_indices_dup
-                dst_indices = dst_indices_dup
-            else:
-                # New self-edges should have one edge entry, not two
-                mask = (
-                    # Not self-edges, no need to deduplicate
-                    (src_indices_dup != dst_indices_dup)
-                    # == : already self-edges; no need to deduplicate
-                    # < : if new self-edges, keep where src < dst
-                    | (src_indices <= dst_indices)
-                )
-                if mask.all():
-                    src_indices = src_indices_dup
-                    dst_indices = dst_indices_dup
-                else:
-                    src_indices = src_indices_dup[mask]
-                    dst_indices = dst_indices_dup[mask]
-                    if edge_values:
-                        edge_values = {
-                            key: val[mask] for key, val in edge_values.items()
-                        }
-                        edge_masks = {key: val[mask] for key, val in edge_masks.items()}
-                    if edge_keys is not None:
-                        edge_keys = [
-                            key for keep, key in zip(mask.tolist(), edge_keys) if keep
-                        ]
-                    if edge_indices is not None:
-                        edge_indices = edge_indices[mask]
-            # Handling of `edge_keys` and `edge_indices` is pure Python to match nx.
-            # This may be slower than we'd like; if it's way too slow, should we
-            # direct users to use the defaults of None?
-            if edge_keys is not None:
-                seen = set()
-                new_edge_keys = []
-                for key in zip(src_indices.tolist(), dst_indices.tolist(), edge_keys):
-                    if key in seen:
-                        src, dst, edge_key = key
-                        if not isinstance(edge_key, (int, float)):
-                            edge_key = 0
-                        for edge_key in itertools.count(edge_key):
-                            if (src, dst, edge_key) not in seen:
-                                seen.add((src, dst, edge_key))
-                                break
-                    else:
-                        seen.add(key)
-                        edge_key = key[2]
-                    new_edge_keys.append(edge_key)
-                edge_keys = new_edge_keys
-            if edge_indices is not None:
-                # PERF: can we do this using cupy?
-                seen = set()
-                new_edge_indices = []
-                for key in zip(
-                    src_indices.tolist(), dst_indices.tolist(), edge_indices.tolist()
-                ):
-                    if key in seen:
-                        src, dst, edge_index = key
-                        for edge_index in itertools.count(edge_index):
-                            if (src, dst, edge_index) not in seen:
-                                seen.add((src, dst, edge_index))
-                                break
-                    else:
-                        seen.add(key)
-                        edge_index = key[2]
-                    new_edge_indices.append(edge_index)
-                edge_indices = cp.array(new_edge_indices, index_dtype)
-        else:
-            stacked_dup = cp.vstack((src_indices_dup, dst_indices_dup))
-            if not edge_values:
-                # Drop duplicates
-                stacked = cp.unique(stacked_dup, axis=1)
-            else:
-                # Drop duplicates. This relies heavily on `_groupby`.
-                # It has not been compared to alternative implementations.
-                # I wonder if there are ways to use assignment using duplicate indices.
-                (stacked, ind, inv) = cp.unique(
-                    stacked_dup, axis=1, return_index=True, return_inverse=True
-                )
-                if ind.dtype != int_dtype:
-                    ind = ind.astype(int_dtype)
-                if inv.dtype != int_dtype:
-                    inv = inv.astype(int_dtype)
-
-                # We need to merge edge data
-                mask = cp.ones(src_indices.size, dtype=bool)
-                mask[ind] = False
-                edge_data = [val[mask] for val in edge_values.values()]
-                edge_data.extend(val[mask] for val in edge_masks.values())
-                groups = _groupby(inv[mask], edge_data)
-
-                edge_values = {key: val[ind] for key, val in edge_values.items()}
-                edge_masks = {key: val[ind] for key, val in edge_masks.items()}
-
-                value_keys = list(edge_values.keys())
-                mask_keys = list(edge_masks.keys())
-
-                values_to_update = defaultdict(list)
-                masks_to_update = defaultdict(list)
-                for k, v in groups.items():
-                    it = iter(v)
-                    vals = dict(zip(value_keys, it))  # zip(strict=False)
-                    masks = dict(zip(mask_keys, it))  # zip(strict=True)
-                    for key, val in vals.items():
-                        if key in masks:
-                            val = val[masks[key]]
-                            if val.size > 0:
-                                values_to_update[key].append((k, val[-1]))
-                                masks_to_update[key].append((k, True))
-                        else:
-                            values_to_update[key].append((k, val[-1]))
-                            if key in edge_masks:
-                                masks_to_update[key].append((k, True))
-
-                int_dtype = _get_int_dtype(src_indices.size - 1)
-                for k, v in values_to_update.items():
-                    ii, jj = zip(*v)
-                    edge_val = edge_values[k]
-                    edge_val[cp.array(ii, dtype=int_dtype)] = cp.array(
-                        jj, dtype=edge_val.dtype
-                    )
-                for k, v in masks_to_update.items():
-                    ii, jj = zip(*v)
-                    edge_masks[k][cp.array(ii, dtype=int_dtype)] = cp.array(
-                        jj, dtype=bool
-                    )
-            src_indices = stacked[0]
-            dst_indices = stacked[1]
-
-    if G.is_multigraph():
-        # `edge_keys` and `edge_indices` are preserved for free if no nodes were merged
-        extra_kwargs = {"edge_keys": edge_keys, "edge_indices": edge_indices}
-    else:
-        extra_kwargs = {}
-    rv = G.__class__.from_coo(
-        len(key_to_previd),
-        src_indices,
-        dst_indices,
-        edge_values=edge_values,
-        edge_masks=edge_masks,
-        node_values=node_values,
-        node_masks=node_masks,
-        id_to_key=newid_to_key,
-        key_to_id=key_to_newid,
-        use_compat_graph=is_compat_graph,
-        **extra_kwargs,
-    )
-    rv.graph.update(G.graph)
-    if not copy:
-        G_orig._become(rv)
-        return G_orig
-    return rv
-
-
-@networkx_algorithm(version_added="24.08")
-def convert_node_labels_to_integers(
-    G, first_label=0, ordering="default", label_attribute=None
-):
-    if ordering not in {"default", "sorted", "increasing degree", "decreasing degree"}:
-        raise nx.NetworkXError(f"Unknown node ordering: {ordering}")
-    if isinstance(G, nx.Graph):
-        is_compat_graph = isinstance(G, nxcg.Graph)
-        G = nxcg.from_networkx(G, preserve_all_attrs=True)
-    else:
-        is_compat_graph = False
-    G = G.copy()
-    if label_attribute is not None:
-        prev_vals = G.id_to_key
-        if prev_vals is None:
-            prev_vals = cp.arange(G._N, dtype=_get_int_dtype(G._N - 1))
-        else:
-            try:
-                prev_vals = np.array(prev_vals)
-            except ValueError:
-                prev_vals = np.fromiter(prev_vals, object)
-            else:
-                try:
-                    prev_vals = cp.array(prev_vals)
-                except ValueError:
-                    pass
-        G.node_values[label_attribute] = prev_vals
-        G.node_masks.pop(label_attribute, None)
-    id_to_key = None
-    if ordering == "default" or ordering == "sorted" and G.key_to_id is None:
-        if first_label == 0:
-            G.key_to_id = None
-        else:
-            id_to_key = list(range(first_label, first_label + G._N))
-            G.key_to_id = dict(zip(id_to_key, range(G._N)))
-    elif ordering == "sorted":
-        key_to_id = G.key_to_id
-        G.key_to_id = {
-            i: key_to_id[n] for i, n in enumerate(sorted(key_to_id), first_label)
-        }
-    else:
-        pairs = sorted(
-            ((d, n) for (n, d) in G._nodearray_to_dict(G._degrees_array()).items()),
-            reverse=ordering == "decreasing degree",
-        )
-        key_to_id = G.key_to_id
-        G.key_to_id = {i: key_to_id[n] for i, (d, n) in enumerate(pairs, first_label)}
-    G._id_to_key = id_to_key
-    if is_compat_graph:
-        return G._to_compat_graph()
-    return G
diff --git a/python/nx-cugraph/nx_cugraph/scripts/__init__.py b/python/nx-cugraph/nx_cugraph/scripts/__init__.py
deleted file mode 100644
index aeae6078111..00000000000
--- a/python/nx-cugraph/nx_cugraph/scripts/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/nx-cugraph/nx_cugraph/scripts/__main__.py b/python/nx-cugraph/nx_cugraph/scripts/__main__.py
deleted file mode 100755
index c0963e64cc5..00000000000
--- a/python/nx-cugraph/nx_cugraph/scripts/__main__.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-if __name__ == "__main__":
-    import argparse
-
-    from nx_cugraph.scripts import print_table, print_tree
-
-    parser = argparse.ArgumentParser(
-        parents=[
-            print_table.get_argumentparser(add_help=False),
-            print_tree.get_argumentparser(add_help=False),
-        ],
-        description="Print info about functions implemented by nx-cugraph",
-    )
-    parser.add_argument("action", choices=["print_table", "print_tree"])
-    args = parser.parse_args()
-    if args.action == "print_table":
-        print_table.main()
-    else:
-        print_tree.main(
-            by=args.by,
-            networkx_path=args.networkx_path,
-            dispatch_name=args.dispatch_name or args.dispatch_name_always,
-            version_added=args.version_added,
-            plc=args.plc,
-            dispatch_name_if_different=not args.dispatch_name_always,
-        )
diff --git a/python/nx-cugraph/nx_cugraph/scripts/print_table.py b/python/nx-cugraph/nx_cugraph/scripts/print_table.py
deleted file mode 100755
index 7c90281247c..00000000000
--- a/python/nx-cugraph/nx_cugraph/scripts/print_table.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import sys
-from collections import namedtuple
-
-from networkx.utils.backends import _registered_algorithms as algos
-
-from _nx_cugraph import get_info
-from nx_cugraph.interface import BackendInterface
-
-
-def get_funcpath(func):
-    return f"{func.__module__}.{func.__name__}"
-
-
-def get_path_to_name():
-    return {
-        get_funcpath(algos[funcname]): funcname
-        for funcname in get_info()["functions"].keys() & algos.keys()
-    }
-
-
-Info = namedtuple(
-    "Info",
-    "networkx_path, dispatch_name, version_added, plc, is_incomplete, is_different",
-)
-
-
-def get_path_to_info(path_to_name=None, version_added_sep=".", plc_sep="/"):
-    if path_to_name is None:
-        path_to_name = get_path_to_name()
-    rv = {}
-    for funcpath in sorted(path_to_name):
-        funcname = path_to_name[funcpath]
-        cufunc = getattr(BackendInterface, funcname)
-        plc = plc_sep.join(sorted(cufunc._plc_names)) if cufunc._plc_names else ""
-        version_added = cufunc.version_added.replace(".", version_added_sep)
-        is_incomplete = cufunc.is_incomplete
-        is_different = cufunc.is_different
-        rv[funcpath] = Info(
-            funcpath, funcname, version_added, plc, is_incomplete, is_different
-        )
-    return rv
-
-
-def main(path_to_info=None, *, file=sys.stdout):
-    if path_to_info is None:
-        path_to_info = get_path_to_info(version_added_sep=".")
-    lines = ["networkx_path,dispatch_name,version_added,plc,is_incomplete,is_different"]
-    lines.extend(",".join(map(str, info)) for info in path_to_info.values())
-    text = "\n".join(lines)
-    if file is not None:
-        print(text, file=file)
-    return text
-
-
-def get_argumentparser(add_help=True):
-    return argparse.ArgumentParser(
-        description="Print info about functions implemented by nx-cugraph as CSV",
-        add_help=add_help,
-    )
-
-
-if __name__ == "__main__":
-    parser = get_argumentparser()
-    args = parser.parse_args()
-    main()
diff --git a/python/nx-cugraph/nx_cugraph/scripts/print_tree.py b/python/nx-cugraph/nx_cugraph/scripts/print_tree.py
deleted file mode 100755
index fbb1c3dd0c5..00000000000
--- a/python/nx-cugraph/nx_cugraph/scripts/print_tree.py
+++ /dev/null
@@ -1,271 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import sys
-
-from nx_cugraph.scripts.print_table import Info, get_path_to_info
-
-
-def assoc_in(d, keys, value):
-    """Like Clojure's assoc-in, but modifies d in-place."""
-    inner = d
-    keys = iter(keys)
-    key = next(keys)
-    for next_key in keys:
-        if key not in inner:
-            inner[key] = {}
-        inner = inner[key]
-        key = next_key
-    inner[key] = value
-    return d
-
-
-def default_get_payload_internal(keys):
-    return keys[-1]
-
-
-def tree_lines(
-    tree,
-    parents=(),
-    are_levels_closing=(),
-    get_payload_internal=default_get_payload_internal,
-):
-    pre = "".join(
-        "    " if is_level_closing else " │  "
-        for is_level_closing in are_levels_closing
-    )
-    c = "├"
-    are_levels_closing += (False,)
-    for i, (key, val) in enumerate(tree.items(), 1):
-        if i == len(tree):  # Last item
-            c = "└"
-            are_levels_closing = are_levels_closing[:-1] + (True,)
-        if isinstance(val, str):
-            yield pre + f" {c}─ " + val
-        else:
-            yield pre + f" {c}─ " + get_payload_internal((*parents, key))
-            yield from tree_lines(
-                val,
-                (*parents, key),
-                are_levels_closing,
-                get_payload_internal=get_payload_internal,
-            )
-
-
-def get_payload(
-    info,
-    *,
-    networkx_path=False,
-    dispatch_name=False,
-    version_added=False,
-    plc=False,
-    dispatch_name_if_different=False,
-    incomplete=False,
-    different=False,
-):
-    extra = []
-    if networkx_path:
-        extra.append(info.networkx_path)
-    if dispatch_name and (
-        not dispatch_name_if_different
-        or info.dispatch_name != info.networkx_path.rsplit(".", 1)[-1]
-    ):
-        extra.append(info.dispatch_name)
-    if version_added:
-        v = info.version_added
-        if len(v) != 5:
-            raise ValueError(f"Is there something wrong with version: {v!r}?")
-        extra.append(v[:2] + "." + v[-2:])
-    if plc and info.plc:
-        extra.append(info.plc)
-    if incomplete and info.is_incomplete:
-        extra.append("is-incomplete")
-    if different and info.is_different:
-        extra.append("is-different")
-    extra = ", ".join(extra)
-    if extra:
-        extra = f" ({extra})"
-    return info.networkx_path.rsplit(".", 1)[-1] + extra
-
-
-def create_tree(
-    path_to_info=None,
-    *,
-    by="networkx_path",
-    skip=0,
-    networkx_path=False,
-    dispatch_name=False,
-    version_added=False,
-    plc=False,
-    dispatch_name_if_different=False,
-    incomplete=False,
-    different=False,
-    prefix="",
-    strip_networkx=True,
-    get_payload=get_payload,
-):
-    if path_to_info is None:
-        path_to_info = get_path_to_info()
-    if strip_networkx:
-        path_to_info = {
-            key: Info(info.networkx_path.replace("networkx.", "", 1), *info[1:])
-            for key, info in path_to_info.items()
-        }
-    if isinstance(by, str):
-        by = [by]
-    # We rely on the fact that dicts maintain order
-    tree = {}
-    for info in sorted(
-        path_to_info.values(),
-        key=lambda x: (*(getattr(x, b) for b in by), x.networkx_path),
-    ):
-        if not all(getattr(info, b) for b in by):
-            continue
-        path = prefix + ".".join(getattr(info, b) for b in by)
-        payload = get_payload(
-            info,
-            networkx_path=networkx_path,
-            dispatch_name=dispatch_name,
-            version_added=version_added,
-            plc=plc,
-            dispatch_name_if_different=dispatch_name_if_different,
-            incomplete=incomplete,
-            different=different,
-        )
-        assoc_in(tree, path.split("."), payload)
-    return tree
-
-
-def main(
-    path_to_info=None,
-    *,
-    by="networkx_path",
-    networkx_path=False,
-    dispatch_name=False,
-    version_added=False,
-    plc=False,
-    dispatch_name_if_different=True,
-    incomplete=False,
-    different=False,
-    file=sys.stdout,
-):
-    if path_to_info is None:
-        path_to_info = get_path_to_info(version_added_sep="-")
-    kwargs = {
-        "networkx_path": networkx_path,
-        "dispatch_name": dispatch_name,
-        "version_added": version_added,
-        "plc": plc,
-        "dispatch_name_if_different": dispatch_name_if_different,
-        "incomplete": incomplete,
-        "different": different,
-    }
-    if by == "networkx_path":
-        tree = create_tree(path_to_info, by="networkx_path", **kwargs)
-        text = "\n".join(tree_lines(tree))
-    elif by == "plc":
-        tree = create_tree(
-            path_to_info,
-            by=["plc", "networkx_path"],
-            prefix="plc-",
-            **kwargs,
-        )
-        text = "\n".join(tree_lines(tree)).replace("plc-", "plc.")
-    elif by == "version_added":
-        tree = create_tree(
-            path_to_info,
-            by=["version_added", "networkx_path"],
-            prefix="version_added-",
-            **kwargs,
-        )
-        text = "\n".join(tree_lines(tree)).replace("version_added-", "version: ")
-        for digit in "0123456789":
-            text = text.replace(f"2{digit}-", f"2{digit}.")
-    else:
-        raise ValueError(
-            "`by` argument should be one of {'networkx_path', 'plc', 'version_added' "
-            f"got: {by}"
-        )
-    if file is not None:
-        print(text, file=file)
-    return text
-
-
-def get_argumentparser(add_help=True):
-    parser = argparse.ArgumentParser(
-        "Print a tree showing NetworkX functions implemented by nx-cugraph",
-        add_help=add_help,
-    )
-    parser.add_argument(
-        "--by",
-        choices=["networkx_path", "plc", "version_added"],
-        default="networkx_path",
-        help="How to group functions",
-    )
-    parser.add_argument(
-        "--dispatch-name",
-        "--dispatch_name",
-        action="store_true",
-        help="Show the dispatch name in parentheses if different from NetworkX name",
-    )
-    parser.add_argument(
-        "--dispatch-name-always",
-        "--dispatch_name_always",
-        action="store_true",
-        help="Always show the dispatch name in parentheses",
-    )
-    parser.add_argument(
-        "--plc",
-        "--pylibcugraph",
-        action="store_true",
-        help="Show the used pylibcugraph function in parentheses",
-    )
-    parser.add_argument(
-        "--version-added",
-        "--version_added",
-        action="store_true",
-        help="Show the version added in parentheses",
-    )
-    parser.add_argument(
-        "--networkx-path",
-        "--networkx_path",
-        action="store_true",
-        help="Show the full networkx path in parentheses",
-    )
-    parser.add_argument(
-        "--incomplete",
-        action="store_true",
-        help="Show which functions are incomplete",
-    )
-    parser.add_argument(
-        "--different",
-        action="store_true",
-        help="Show which functions are different",
-    )
-    return parser
-
-
-if __name__ == "__main__":
-    parser = get_argumentparser()
-    args = parser.parse_args()
-    main(
-        by=args.by,
-        networkx_path=args.networkx_path,
-        dispatch_name=args.dispatch_name or args.dispatch_name_always,
-        version_added=args.version_added,
-        plc=args.plc,
-        dispatch_name_if_different=not args.dispatch_name_always,
-        incomplete=args.incomplete,
-        different=args.different,
-    )
diff --git a/python/nx-cugraph/nx_cugraph/tests/__init__.py b/python/nx-cugraph/nx_cugraph/tests/__init__.py
deleted file mode 100644
index c2002fd3fb9..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/nx-cugraph/nx_cugraph/tests/bench_convert.py b/python/nx-cugraph/nx_cugraph/tests/bench_convert.py
deleted file mode 100644
index 2eb432230eb..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/bench_convert.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import random
-
-import networkx as nx
-import numpy as np
-import pytest
-
-import nx_cugraph as nxcg
-
-try:
-    import cugraph
-except ModuleNotFoundError:
-    cugraph = None
-try:
-    import scipy
-except ModuleNotFoundError:
-    scipy = None
-
-# If the rapids-pytest-benchmark plugin is installed, the "gpubenchmark"
-# fixture will be available automatically. Check that this fixture is available
-# by trying to import rapids_pytest_benchmark, and if that fails, set
-# "gpubenchmark" to the standard "benchmark" fixture provided by
-# pytest-benchmark.
-try:
-    import rapids_pytest_benchmark  # noqa: F401
-except ModuleNotFoundError:
-    import pytest_benchmark
-
-    gpubenchmark = pytest_benchmark.plugin.benchmark
-
-CREATE_USING = [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph]
-
-
-def _bench_helper(gpubenchmark, N, attr_kind, create_using, method):
-    G = method(N, create_using=create_using)
-    if attr_kind:
-        skip = True
-        for *_ids, edgedict in G.edges(data=True):
-            skip = not skip
-            if skip and attr_kind not in {"full", "required", "required_dtype"}:
-                continue
-            edgedict["x"] = random.randint(0, 100000)
-        if attr_kind == "preserve":
-            gpubenchmark(nxcg.from_networkx, G, preserve_edge_attrs=True)
-        elif attr_kind == "half_missing":
-            gpubenchmark(nxcg.from_networkx, G, edge_attrs={"x": None})
-        elif attr_kind == "required":
-            gpubenchmark(nxcg.from_networkx, G, edge_attrs={"x": ...})
-        elif attr_kind == "required_dtype":
-            gpubenchmark(
-                nxcg.from_networkx,
-                G,
-                edge_attrs={"x": ...},
-                edge_dtypes={"x": np.int32},
-            )
-        else:  # full, half_default
-            gpubenchmark(nxcg.from_networkx, G, edge_attrs={"x": 0})
-    else:
-        gpubenchmark(nxcg.from_networkx, G)
-
-
-def _bench_helper_cugraph(
-    gpubenchmark, N, attr_kind, create_using, method, do_renumber
-):
-    G = method(N, create_using=create_using)
-    if attr_kind:
-        for *_ids, edgedict in G.edges(data=True):
-            edgedict["x"] = random.randint(0, 100000)
-        gpubenchmark(cugraph.utilities.convert_from_nx, G, "x", do_renumber=do_renumber)
-    else:
-        gpubenchmark(cugraph.utilities.convert_from_nx, G, do_renumber=do_renumber)
-
-
-def _bench_helper_scipy(gpubenchmark, N, attr_kind, create_using, method, fmt):
-    G = method(N, create_using=create_using)
-    if attr_kind:
-        for *_ids, edgedict in G.edges(data=True):
-            edgedict["x"] = random.randint(0, 100000)
-        gpubenchmark(nx.to_scipy_sparse_array, G, weight="x", format=fmt)
-    else:
-        gpubenchmark(nx.to_scipy_sparse_array, G, weight=None, format=fmt)
-
-
-@pytest.mark.parametrize("N", [1, 10**6])
-@pytest.mark.parametrize(
-    "attr_kind",
-    [
-        "required_dtype",
-        "required",
-        "full",
-        "half_missing",
-        "half_default",
-        "preserve",
-        None,
-    ],
-)
-@pytest.mark.parametrize("create_using", CREATE_USING)
-def bench_cycle_graph(gpubenchmark, N, attr_kind, create_using):
-    _bench_helper(gpubenchmark, N, attr_kind, create_using, nx.cycle_graph)
-
-
-@pytest.mark.skipif("not cugraph")
-@pytest.mark.parametrize("N", [1, 10**6])
-@pytest.mark.parametrize("attr_kind", ["full", None])
-@pytest.mark.parametrize("create_using", CREATE_USING)
-@pytest.mark.parametrize("do_renumber", [True, False])
-def bench_cycle_graph_cugraph(gpubenchmark, N, attr_kind, create_using, do_renumber):
-    if N == 1 and not do_renumber:
-        do_renumber = True
-    _bench_helper_cugraph(
-        gpubenchmark, N, attr_kind, create_using, nx.cycle_graph, do_renumber
-    )
-
-
-@pytest.mark.skipif("not scipy")
-@pytest.mark.parametrize("N", [1, 10**6])
-@pytest.mark.parametrize("attr_kind", ["full", None])
-@pytest.mark.parametrize("create_using", CREATE_USING)
-@pytest.mark.parametrize("fmt", ["coo", "csr"])
-def bench_cycle_graph_scipy(gpubenchmark, N, attr_kind, create_using, fmt):
-    _bench_helper_scipy(gpubenchmark, N, attr_kind, create_using, nx.cycle_graph, fmt)
-
-
-@pytest.mark.parametrize("N", [1, 1500])
-@pytest.mark.parametrize(
-    "attr_kind",
-    [
-        "required_dtype",
-        "required",
-        "full",
-        "half_missing",
-        "half_default",
-        "preserve",
-        None,
-    ],
-)
-@pytest.mark.parametrize("create_using", CREATE_USING)
-def bench_complete_graph_edgedata(gpubenchmark, N, attr_kind, create_using):
-    _bench_helper(gpubenchmark, N, attr_kind, create_using, nx.complete_graph)
-
-
-@pytest.mark.parametrize("N", [3000])
-@pytest.mark.parametrize("attr_kind", [None])
-@pytest.mark.parametrize("create_using", CREATE_USING)
-def bench_complete_graph_noedgedata(gpubenchmark, N, attr_kind, create_using):
-    _bench_helper(gpubenchmark, N, attr_kind, create_using, nx.complete_graph)
-
-
-@pytest.mark.skipif("not cugraph")
-@pytest.mark.parametrize("N", [1, 1500])
-@pytest.mark.parametrize("attr_kind", ["full", None])
-@pytest.mark.parametrize("create_using", CREATE_USING)
-@pytest.mark.parametrize("do_renumber", [True, False])
-def bench_complete_graph_cugraph(gpubenchmark, N, attr_kind, create_using, do_renumber):
-    if N == 1 and not do_renumber:
-        do_renumber = True
-    _bench_helper_cugraph(
-        gpubenchmark, N, attr_kind, create_using, nx.complete_graph, do_renumber
-    )
-
-
-@pytest.mark.skipif("not scipy")
-@pytest.mark.parametrize("N", [1, 1500])
-@pytest.mark.parametrize("attr_kind", ["full", None])
-@pytest.mark.parametrize("create_using", CREATE_USING)
-@pytest.mark.parametrize("fmt", ["coo", "csr"])
-def bench_complete_graph_scipy(gpubenchmark, N, attr_kind, create_using, fmt):
-    _bench_helper_scipy(
-        gpubenchmark, N, attr_kind, create_using, nx.complete_graph, fmt
-    )
diff --git a/python/nx-cugraph/nx_cugraph/tests/conftest.py b/python/nx-cugraph/nx_cugraph/tests/conftest.py
deleted file mode 100644
index e5a250784b2..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/conftest.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-def pytest_configure(config):
-    if config.getoption("--all", False):
-        # Run benchmarks AND tests
-        config.option.benchmark_skip = False
-        config.option.benchmark_enable = True
-    elif config.getoption("--bench", False) or config.getoption(
-        "--benchmark-enable", False
-    ):
-        # Run benchmarks (and only benchmarks) with `--bench` argument
-        config.option.benchmark_skip = False
-        config.option.benchmark_enable = True
-        if not config.option.keyword:
-            config.option.keyword = "bench_"
-    else:
-        # Run only tests
-        config.option.benchmark_skip = True
-        config.option.benchmark_enable = False
-        if not config.option.keyword:
-            config.option.keyword = "test_"
diff --git a/python/nx-cugraph/nx_cugraph/tests/ensure_algos_covered.py b/python/nx-cugraph/nx_cugraph/tests/ensure_algos_covered.py
deleted file mode 100644
index 7047f0eeafd..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/ensure_algos_covered.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Ensure that all functions wrapped by @networkx_algorithm were called.
-
-This file is run by CI and should not normally be run manually.
-"""
-import inspect
-import json
-from pathlib import Path
-
-from nx_cugraph.interface import BackendInterface
-from nx_cugraph.utils import networkx_algorithm
-
-with Path("coverage.json").open() as f:
-    coverage = json.load(f)
-
-filenames_to_executed_lines = {
-    "nx_cugraph/"
-    + filename.rsplit("nx_cugraph/", 1)[-1]: set(coverage_info["executed_lines"])
-    for filename, coverage_info in coverage["files"].items()
-}
-
-
-def unwrap(func):
-    while hasattr(func, "__wrapped__"):
-        func = func.__wrapped__
-    return func
-
-
-def get_func_filename(func):
-    return "nx_cugraph" + inspect.getfile(unwrap(func)).rsplit("nx_cugraph", 1)[-1]
-
-
-def get_func_linenos(func):
-    lines, lineno = inspect.getsourcelines(unwrap(func))
-    for i, line in enumerate(lines, lineno):
-        if ":\n" in line:
-            return set(range(i + 1, lineno + len(lines)))
-    raise RuntimeError(f"Could not determine line numbers for function {func}")
-
-
-def has_any_coverage(func):
-    return bool(
-        filenames_to_executed_lines[get_func_filename(func)] & get_func_linenos(func)
-    )
-
-
-def main():
-    no_coverage = set()
-    for attr, func in vars(BackendInterface).items():
-        if not isinstance(func, networkx_algorithm):
-            continue
-        if not has_any_coverage(func):
-            no_coverage.add(attr)
-    if no_coverage:
-        msg = "The following algorithms have no coverage: " + ", ".join(
-            sorted(no_coverage)
-        )
-        # Create a border of "!"
-        msg = (
-            "\n\n"
-            + "!" * (len(msg) + 6)
-            + "\n!! "
-            + msg
-            + " !!\n"
-            + "!" * (len(msg) + 6)
-            + "\n"
-        )
-        raise AssertionError(msg)
-    print("\nSuccess: coverage determined all algorithms were called!\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/nx-cugraph/nx_cugraph/tests/pytest.ini b/python/nx-cugraph/nx_cugraph/tests/pytest.ini
deleted file mode 100644
index 7b0a9f29fb1..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/pytest.ini
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-[pytest]
-addopts = --tb=native
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_bfs.py b/python/nx-cugraph/nx_cugraph/tests/test_bfs.py
deleted file mode 100644
index ad2c62c1fb9..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_bfs.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import pytest
-
-from nx_cugraph import _nxver
-
-if _nxver < (3, 2):
-    pytest.skip("Need NetworkX >=3.2 to test clustering", allow_module_level=True)
-
-
-def test_generic_bfs_edges():
-    # generic_bfs_edges currently isn't exercised by networkx tests
-    Gnx = nx.karate_club_graph()
-    Gcg = nx.karate_club_graph(backend="cugraph")
-    for depth_limit in (0, 1, 2):
-        for source in Gnx:
-            # Some ordering is arbitrary, so I think there's a chance
-            # this test may fail if networkx or nx-cugraph changes.
-            nx_result = nx.generic_bfs_edges(Gnx, source, depth_limit=depth_limit)
-            cg_result = nx.generic_bfs_edges(Gcg, source, depth_limit=depth_limit)
-            assert sorted(nx_result) == sorted(cg_result), (source, depth_limit)
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_classes.py b/python/nx-cugraph/nx_cugraph/tests/test_classes.py
deleted file mode 100644
index 0ac238b3558..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_classes.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import nx_cugraph as nxcg
-
-
-def test_class_to_class():
-    """Basic sanity checks to ensure metadata relating graph classes are accurate."""
-    for prefix in ["", "Cuda"]:
-        for suffix in ["Graph", "DiGraph", "MultiGraph", "MultiDiGraph"]:
-            cls_name = f"{prefix}{suffix}"
-            cls = getattr(nxcg, cls_name)
-            assert cls.__name__ == cls_name
-            G = cls()
-            assert cls is G.__class__
-            # cudagraph
-            val = cls.to_cudagraph_class()
-            val2 = G.to_cudagraph_class()
-            assert val is val2
-            assert val.__name__ == f"Cuda{suffix}"
-            assert val.__module__.startswith("nx_cugraph")
-            assert cls.is_directed() == G.is_directed() == val.is_directed()
-            assert cls.is_multigraph() == G.is_multigraph() == val.is_multigraph()
-            # networkx
-            val = cls.to_networkx_class()
-            val2 = G.to_networkx_class()
-            assert val is val2
-            assert val.__name__ == suffix
-            assert val.__module__.startswith("networkx")
-            val = val()
-            assert cls.is_directed() == G.is_directed() == val.is_directed()
-            assert cls.is_multigraph() == G.is_multigraph() == val.is_multigraph()
-            # directed
-            val = cls.to_directed_class()
-            val2 = G.to_directed_class()
-            assert val is val2
-            assert val.__module__.startswith("nx_cugraph")
-            assert val.is_directed()
-            assert cls.is_multigraph() == G.is_multigraph() == val.is_multigraph()
-            if "Di" in suffix:
-                assert val is cls
-            else:
-                assert "Di" in val.__name__
-                assert prefix in val.__name__
-                assert cls.to_undirected_class() is cls
-            # undirected
-            val = cls.to_undirected_class()
-            val2 = G.to_undirected_class()
-            assert val is val2
-            assert val.__module__.startswith("nx_cugraph")
-            assert not val.is_directed()
-            assert cls.is_multigraph() == G.is_multigraph() == val.is_multigraph()
-            if "Di" not in suffix:
-                assert val is cls
-            else:
-                assert "Di" not in val.__name__
-                assert prefix in val.__name__
-                assert cls.to_directed_class() is cls
-            # "zero"
-            if prefix == "Cuda":
-                val = cls._to_compat_graph_class()
-                val2 = G._to_compat_graph_class()
-                assert val is val2
-                assert val.__name__ == suffix
-                assert val.__module__.startswith("nx_cugraph")
-                assert val.to_cudagraph_class() is cls
-                assert cls.is_directed() == G.is_directed() == val.is_directed()
-                assert cls.is_multigraph() == G.is_multigraph() == val.is_multigraph()
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_classes_function.py b/python/nx-cugraph/nx_cugraph/tests/test_classes_function.py
deleted file mode 100644
index d6152f650fb..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_classes_function.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Test functions from nx_cugraph/classes/function.py"""
-import networkx as nx
-
-import nx_cugraph as nxcg
-
-
-def test_is_negatively_weighted():
-    Gnx = nx.MultiGraph()
-    Gnx.add_edge(0, 1, 2, weight=-3)
-    Gnx.add_edge(2, 3, foo=3)
-    Gcg = nxcg.from_networkx(Gnx, preserve_edge_attrs=True)
-    assert nx.is_negatively_weighted(Gnx)
-    assert nxcg.is_negatively_weighted(Gnx)
-    assert nxcg.is_negatively_weighted(Gcg)
-    assert not nx.is_negatively_weighted(Gnx, weight="foo")
-    assert not nxcg.is_negatively_weighted(Gcg, weight="foo")
-    assert not nx.is_negatively_weighted(Gnx, weight="bar")
-    assert not nxcg.is_negatively_weighted(Gcg, weight="bar")
-    assert nx.is_negatively_weighted(Gnx, (0, 1, 2))
-    assert nxcg.is_negatively_weighted(Gcg, (0, 1, 2))
-    assert nx.is_negatively_weighted(Gnx, (0, 1)) == nxcg.is_negatively_weighted(
-        Gcg, (0, 1)
-    )
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_cluster.py b/python/nx-cugraph/nx_cugraph/tests/test_cluster.py
deleted file mode 100644
index fd8e1b3cf13..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_cluster.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import pytest
-
-from nx_cugraph import _nxver
-
-if _nxver < (3, 2):
-    pytest.skip("Need NetworkX >=3.2 to test clustering", allow_module_level=True)
-
-
-def test_selfloops():
-    G = nx.complete_graph(5)
-    H = nx.complete_graph(5)
-    H.add_edge(0, 0)
-    H.add_edge(1, 1)
-    H.add_edge(2, 2)
-    # triangles
-    expected = nx.triangles(G)
-    assert expected == nx.triangles(H)
-    assert expected == nx.triangles(G, backend="cugraph")
-    assert expected == nx.triangles(H, backend="cugraph")
-    # average_clustering
-    expected = nx.average_clustering(G)
-    assert expected == nx.average_clustering(H)
-    assert expected == nx.average_clustering(G, backend="cugraph")
-    assert expected == nx.average_clustering(H, backend="cugraph")
-    # clustering
-    expected = nx.clustering(G)
-    assert expected == nx.clustering(H)
-    assert expected == nx.clustering(G, backend="cugraph")
-    assert expected == nx.clustering(H, backend="cugraph")
-    # transitivity
-    expected = nx.transitivity(G)
-    assert expected == nx.transitivity(H)
-    assert expected == nx.transitivity(G, backend="cugraph")
-    assert expected == nx.transitivity(H, backend="cugraph")
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_community.py b/python/nx-cugraph/nx_cugraph/tests/test_community.py
deleted file mode 100644
index 126f45c14ae..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_community.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import pytest
-
-import nx_cugraph as nxcg
-
-
-def test_louvain_isolated_nodes():
-    is_nx_30_or_31 = hasattr(nx.classes, "backends")
-
-    def check(left, right):
-        assert len(left) == len(right)
-        assert set(map(frozenset, left)) == set(map(frozenset, right))
-
-    # Empty graph (no nodes)
-    G = nx.Graph()
-    if is_nx_30_or_31:
-        with pytest.raises(ZeroDivisionError):
-            nx.community.louvain_communities(G)
-    else:
-        nx_result = nx.community.louvain_communities(G)
-        cg_result = nxcg.community.louvain_communities(G)
-        check(nx_result, cg_result)
-    # Graph with no edges
-    G.add_nodes_from(range(5))
-    if is_nx_30_or_31:
-        with pytest.raises(ZeroDivisionError):
-            nx.community.louvain_communities(G)
-    else:
-        nx_result = nx.community.louvain_communities(G)
-        cg_result = nxcg.community.louvain_communities(G)
-        check(nx_result, cg_result)
-    # Graph with isolated nodes
-    G.add_edge(1, 2)
-    nx_result = nx.community.louvain_communities(G)
-    cg_result = nxcg.community.louvain_communities(G)
-    check(nx_result, cg_result)
-    # Another one
-    G.add_edge(4, 4)
-    nx_result = nx.community.louvain_communities(G)
-    cg_result = nxcg.community.louvain_communities(G)
-    check(nx_result, cg_result)
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_connected.py b/python/nx-cugraph/nx_cugraph/tests/test_connected.py
deleted file mode 100644
index fa9f283abc0..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_connected.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-
-import nx_cugraph as nxcg
-
-
-def test_connected_isolated_nodes():
-    G = nx.complete_graph(4)
-    G.add_node(max(G) + 1)
-    assert nx.is_connected(G) is False
-    assert nxcg.is_connected(G) is False
-    assert nx.number_connected_components(G) == 2
-    assert nxcg.number_connected_components(G) == 2
-    assert sorted(nx.connected_components(G)) == [{0, 1, 2, 3}, {4}]
-    assert sorted(nxcg.connected_components(G)) == [{0, 1, 2, 3}, {4}]
-    assert nx.node_connected_component(G, 0) == {0, 1, 2, 3}
-    assert nxcg.node_connected_component(G, 0) == {0, 1, 2, 3}
-    assert nx.node_connected_component(G, 4) == {4}
-    assert nxcg.node_connected_component(G, 4) == {4}
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_convert.py b/python/nx-cugraph/nx_cugraph/tests/test_convert.py
deleted file mode 100644
index 3d109af8a74..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_convert.py
+++ /dev/null
@@ -1,271 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import networkx as nx
-import pytest
-
-import nx_cugraph as nxcg
-from nx_cugraph import interface
-
-
-@pytest.mark.parametrize(
-    "graph_class", [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph]
-)
-@pytest.mark.parametrize(
-    "kwargs",
-    [
-        {},
-        {"preserve_edge_attrs": True},
-        {"preserve_node_attrs": True},
-        {"preserve_all_attrs": True},
-        {"edge_attrs": {"x": 0}},
-        {"edge_attrs": {"x": None}},
-        {"edge_attrs": {"x": nxcg.convert.REQUIRED}},
-        {"edge_attrs": {"x": ...}},  # sugar for REQUIRED
-        {"edge_attrs": "x"},
-        {"node_attrs": {"x": 0}},
-        {"node_attrs": {"x": None}},
-        {"node_attrs": {"x": nxcg.convert.REQUIRED}},
-        {"node_attrs": {"x": ...}},  # sugar for REQUIRED
-        {"node_attrs": "x"},
-    ],
-)
-def test_convert_empty(graph_class, kwargs):
-    G = graph_class()
-    Gcg = nxcg.from_networkx(G, **kwargs)
-    H = nxcg.to_networkx(Gcg)
-    assert G.number_of_nodes() == Gcg.number_of_nodes() == H.number_of_nodes() == 0
-    assert G.number_of_edges() == Gcg.number_of_edges() == H.number_of_edges() == 0
-    assert Gcg.edge_values == Gcg.edge_masks == Gcg.node_values == Gcg.node_masks == {}
-    assert G.graph == Gcg.graph == H.graph == {}
-
-
-@pytest.mark.parametrize("graph_class", [nx.Graph, nx.MultiGraph])
-def test_convert(graph_class):
-    # FIXME: can we break this into smaller tests?
-    G = graph_class()
-    G.add_edge(0, 1, x=2)
-    G.add_node(0, foo=10)
-    G.add_node(1, foo=20, bar=100)
-    for kwargs in [
-        {"preserve_edge_attrs": True},
-        {"preserve_all_attrs": True},
-        {"edge_attrs": {"x": 0}},
-        {"edge_attrs": {"x": None}, "node_attrs": {"bar": None}},
-        {"edge_attrs": "x", "edge_dtypes": int},
-        {
-            "edge_attrs": {"x": nxcg.convert.REQUIRED},
-            "node_attrs": {"foo": nxcg.convert.REQUIRED},
-        },
-        {"edge_attrs": {"x": ...}, "node_attrs": {"foo": ...}},  # sugar for REQUIRED
-    ]:
-        # All edges have "x" attribute, so all kwargs are equivalent
-        Gcg = nxcg.from_networkx(G, **kwargs)
-        cp.testing.assert_array_equal(Gcg.src_indices, [0, 1])
-        cp.testing.assert_array_equal(Gcg.dst_indices, [1, 0])
-        cp.testing.assert_array_equal(Gcg.edge_values["x"], [2, 2])
-        assert len(Gcg.edge_values) == 1
-        assert Gcg.edge_masks == {}
-        H = nxcg.to_networkx(Gcg)
-        assert G.number_of_nodes() == Gcg.number_of_nodes() == H.number_of_nodes() == 2
-        assert G.number_of_edges() == Gcg.number_of_edges() == H.number_of_edges() == 1
-        assert G.adj == H.adj
-
-    with pytest.raises(KeyError, match="bar"):
-        nxcg.from_networkx(G, node_attrs={"bar": ...})
-
-    # Structure-only graph (no edge attributes)
-    Gcg = nxcg.from_networkx(G, preserve_node_attrs=True)
-    cp.testing.assert_array_equal(Gcg.src_indices, [0, 1])
-    cp.testing.assert_array_equal(Gcg.dst_indices, [1, 0])
-    cp.testing.assert_array_equal(Gcg.node_values["foo"], [10, 20])
-    assert Gcg.edge_values == Gcg.edge_masks == {}
-    H = nxcg.to_networkx(Gcg)
-    if G.is_multigraph():
-        assert set(G.edges) == set(H.edges) == {(0, 1, 0)}
-    else:
-        assert set(G.edges) == set(H.edges) == {(0, 1)}
-    assert G.nodes == H.nodes
-
-    # Fill completely missing attribute with default value
-    Gcg = nxcg.from_networkx(G, edge_attrs={"y": 0})
-    cp.testing.assert_array_equal(Gcg.src_indices, [0, 1])
-    cp.testing.assert_array_equal(Gcg.dst_indices, [1, 0])
-    cp.testing.assert_array_equal(Gcg.edge_values["y"], [0, 0])
-    assert len(Gcg.edge_values) == 1
-    assert Gcg.edge_masks == Gcg.node_values == Gcg.node_masks == {}
-    H = nxcg.to_networkx(Gcg)
-    assert list(H.edges(data=True)) == [(0, 1, {"y": 0})]
-    if Gcg.is_multigraph():
-        assert set(H.edges) == {(0, 1, 0)}
-
-    # If attribute is completely missing (and no default), then just ignore it
-    Gcg = nxcg.from_networkx(G, edge_attrs={"y": None})
-    cp.testing.assert_array_equal(Gcg.src_indices, [0, 1])
-    cp.testing.assert_array_equal(Gcg.dst_indices, [1, 0])
-    assert sorted(Gcg.edge_values) == sorted(Gcg.edge_masks) == []
-    H = nxcg.to_networkx(Gcg)
-    assert list(H.edges(data=True)) == [(0, 1, {})]
-    if Gcg.is_multigraph():
-        assert set(H.edges) == {(0, 1, 0)}
-
-    G.add_edge(0, 2)
-    # Some edges are missing 'x' attribute; need to use a mask
-    for kwargs in [{"preserve_edge_attrs": True}, {"edge_attrs": {"x": None}}]:
-        Gcg = nxcg.from_networkx(G, **kwargs)
-        cp.testing.assert_array_equal(Gcg.src_indices, [0, 0, 1, 2])
-        cp.testing.assert_array_equal(Gcg.dst_indices, [1, 2, 0, 0])
-        assert sorted(Gcg.edge_values) == sorted(Gcg.edge_masks) == ["x"]
-        cp.testing.assert_array_equal(Gcg.edge_masks["x"], [True, False, True, False])
-        cp.testing.assert_array_equal(Gcg.edge_values["x"][Gcg.edge_masks["x"]], [2, 2])
-    H = nxcg.to_networkx(Gcg)
-    assert list(H.edges(data=True)) == [(0, 1, {"x": 2}), (0, 2, {})]
-    if Gcg.is_multigraph():
-        assert set(H.edges) == {(0, 1, 0), (0, 2, 0)}
-
-    with pytest.raises(KeyError, match="x"):
-        nxcg.from_networkx(G, edge_attrs={"x": nxcg.convert.REQUIRED})
-    with pytest.raises(KeyError, match="x"):
-        nxcg.from_networkx(G, edge_attrs={"x": ...})
-    with pytest.raises(KeyError, match="bar"):
-        nxcg.from_networkx(G, node_attrs={"bar": nxcg.convert.REQUIRED})
-    with pytest.raises(KeyError, match="bar"):
-        nxcg.from_networkx(G, node_attrs={"bar": ...})
-
-    # Now for something more complicated...
-    G = graph_class()
-    G.add_edge(10, 20, x=1)
-    G.add_edge(10, 30, x=2, y=1.5)
-    G.add_node(10, foo=100)
-    G.add_node(20, foo=200, bar=1000)
-    G.add_node(30, foo=300)
-    # Some edges have masks, some don't
-    for kwargs in [
-        {"preserve_edge_attrs": True},
-        {"preserve_all_attrs": True},
-        {"edge_attrs": {"x": None, "y": None}},
-        {"edge_attrs": {"x": 0, "y": None}},
-        {"edge_attrs": {"x": 0, "y": None}},
-        {"edge_attrs": {"x": 0, "y": None}, "edge_dtypes": {"x": int, "y": float}},
-    ]:
-        Gcg = nxcg.from_networkx(G, **kwargs)
-        assert Gcg.id_to_key == [10, 20, 30]  # Remap node IDs to 0, 1, ...
-        cp.testing.assert_array_equal(Gcg.src_indices, [0, 0, 1, 2])
-        cp.testing.assert_array_equal(Gcg.dst_indices, [1, 2, 0, 0])
-        cp.testing.assert_array_equal(Gcg.edge_values["x"], [1, 2, 1, 2])
-        assert sorted(Gcg.edge_masks) == ["y"]
-        cp.testing.assert_array_equal(Gcg.edge_masks["y"], [False, True, False, True])
-        cp.testing.assert_array_equal(
-            Gcg.edge_values["y"][Gcg.edge_masks["y"]], [1.5, 1.5]
-        )
-        H = nxcg.to_networkx(Gcg)
-        assert G.adj == H.adj
-
-    # Some nodes have masks, some don't
-    for kwargs in [
-        {"preserve_node_attrs": True},
-        {"preserve_all_attrs": True},
-        {"node_attrs": {"foo": None, "bar": None}},
-        {"node_attrs": {"foo": None, "bar": None}},
-        {"node_attrs": {"foo": 0, "bar": None, "missing": None}},
-    ]:
-        Gcg = nxcg.from_networkx(G, **kwargs)
-        assert Gcg.id_to_key == [10, 20, 30]  # Remap node IDs to 0, 1, ...
-        cp.testing.assert_array_equal(Gcg.src_indices, [0, 0, 1, 2])
-        cp.testing.assert_array_equal(Gcg.dst_indices, [1, 2, 0, 0])
-        cp.testing.assert_array_equal(Gcg.node_values["foo"], [100, 200, 300])
-        assert sorted(Gcg.node_masks) == ["bar"]
-        cp.testing.assert_array_equal(Gcg.node_masks["bar"], [False, True, False])
-        cp.testing.assert_array_equal(
-            Gcg.node_values["bar"][Gcg.node_masks["bar"]], [1000]
-        )
-        H = nxcg.to_networkx(Gcg)
-        assert G.nodes == H.nodes
-
-    # Check default values for nodes
-    for kwargs in [
-        {"node_attrs": {"foo": None, "bar": 0}},
-        {"node_attrs": {"foo": None, "bar": 0, "missing": None}},
-        {"node_attrs": {"bar": 0}},
-        {"node_attrs": {"bar": 0}, "node_dtypes": {"bar": int}},
-        {"node_attrs": {"bar": 0, "foo": None}, "node_dtypes": int},
-    ]:
-        Gcg = nxcg.from_networkx(G, **kwargs)
-        assert Gcg.id_to_key == [10, 20, 30]  # Remap node IDs to 0, 1, ...
-        cp.testing.assert_array_equal(Gcg.src_indices, [0, 0, 1, 2])
-        cp.testing.assert_array_equal(Gcg.dst_indices, [1, 2, 0, 0])
-        cp.testing.assert_array_equal(Gcg.node_values["bar"], [0, 1000, 0])
-        assert Gcg.node_masks == {}
-
-    with pytest.raises(
-        TypeError, match="edge_attrs and weight arguments should not both be given"
-    ):
-        interface.BackendInterface.convert_from_nx(G, edge_attrs={"x": 1}, weight="x")
-    with pytest.raises(TypeError, match="Expected networkx.Graph"):
-        nxcg.from_networkx({})
-
-
-@pytest.mark.parametrize("graph_class", [nx.MultiGraph, nx.MultiDiGraph])
-def test_multigraph(graph_class):
-    G = graph_class()
-    G.add_edge(0, 1, "key1", x=10)
-    G.add_edge(0, 1, "key2", y=20)
-    Gcg = nxcg.from_networkx(G, preserve_edge_attrs=True)
-    H = nxcg.to_networkx(Gcg)
-    assert type(G) is type(H)
-    assert nx.utils.graphs_equal(G, H)
-
-
-def test_to_dict_of_lists():
-    G = nx.MultiGraph()
-    G.add_edge("a", "b")
-    G.add_edge("a", "c")
-    G.add_edge("a", "b")
-    expected = nx.to_dict_of_lists(G)
-    result = nxcg.to_dict_of_lists(G)
-    assert expected == result
-    expected = nx.to_dict_of_lists(G, nodelist=["a", "b"])
-    result = nxcg.to_dict_of_lists(G, nodelist=["a", "b"])
-    assert expected == result
-    with pytest.raises(nx.NetworkXError, match="The node d is not in the graph"):
-        nx.to_dict_of_lists(G, nodelist=["a", "d"])
-    with pytest.raises(nx.NetworkXError, match="The node d is not in the graph"):
-        nxcg.to_dict_of_lists(G, nodelist=["a", "d"])
-    G.add_node("d")  # No edges
-    expected = nx.to_dict_of_lists(G)
-    result = nxcg.to_dict_of_lists(G)
-    assert expected == result
-    expected = nx.to_dict_of_lists(G, nodelist=["a", "d"])
-    result = nxcg.to_dict_of_lists(G, nodelist=["a", "d"])
-    assert expected == result
-    # Now try with default node ids
-    G = nx.DiGraph()
-    G.add_edge(0, 1)
-    G.add_edge(0, 2)
-    expected = nx.to_dict_of_lists(G)
-    result = nxcg.to_dict_of_lists(G)
-    assert expected == result
-    expected = nx.to_dict_of_lists(G, nodelist=[0, 1])
-    result = nxcg.to_dict_of_lists(G, nodelist=[0, 1])
-    assert expected == result
-    with pytest.raises(nx.NetworkXError, match="The node 3 is not in the digraph"):
-        nx.to_dict_of_lists(G, nodelist=[0, 3])
-    with pytest.raises(nx.NetworkXError, match="The node 3 is not in the digraph"):
-        nxcg.to_dict_of_lists(G, nodelist=[0, 3])
-    G.add_node(3)  # No edges
-    expected = nx.to_dict_of_lists(G)
-    result = nxcg.to_dict_of_lists(G)
-    assert expected == result
-    expected = nx.to_dict_of_lists(G, nodelist=[0, 3])
-    result = nxcg.to_dict_of_lists(G, nodelist=[0, 3])
-    assert expected == result
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py b/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py
deleted file mode 100644
index 0a9cc087ce0..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import pandas as pd
-import pytest
-
-import nx_cugraph as nxcg
-from nx_cugraph.utils import _cp_iscopied_asarray
-
-try:
-    import cudf
-except ModuleNotFoundError:
-    cudf = None
-
-
-DATA = [
-    {"source": [0, 1], "target": [1, 2]},  # nodes are 0, 1, 2
-    {"source": [0, 1], "target": [1, 3]},  # nodes are 0, 1, 3 (need renumbered!)
-    {"source": ["a", "b"], "target": ["b", "c"]},  # nodes are 'a', 'b', 'c'
-]
-CREATE_USING = [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph]
-
-
-@pytest.mark.skipif("not cudf")
-@pytest.mark.parametrize("data", DATA)
-@pytest.mark.parametrize("create_using", CREATE_USING)
-def test_from_cudf_edgelist(data, create_using):
-    df = cudf.DataFrame(data)
-    nxcg.from_pandas_edgelist(df, create_using=create_using)  # Basic smoke test
-    source = df["source"]
-    if source.dtype == int:
-        is_copied, src_array = _cp_iscopied_asarray(source)
-        assert is_copied is False
-        is_copied, src_array = _cp_iscopied_asarray(source.to_cupy())
-        assert is_copied is False
-        is_copied, src_array = _cp_iscopied_asarray(source, orig_object=source)
-        assert is_copied is False
-        is_copied, src_array = _cp_iscopied_asarray(
-            source.to_cupy(), orig_object=source
-        )
-        assert is_copied is False
-        # to numpy
-        is_copied, src_array = _cp_iscopied_asarray(source.to_numpy())
-        assert is_copied is True
-        is_copied, src_array = _cp_iscopied_asarray(
-            source.to_numpy(), orig_object=source
-        )
-        assert is_copied is True
-    else:
-        with pytest.raises(TypeError):
-            _cp_iscopied_asarray(source)
-        with pytest.raises(TypeError):
-            _cp_iscopied_asarray(source.to_cupy())
-        with pytest.raises(ValueError, match="Unsupported dtype"):
-            _cp_iscopied_asarray(source.to_numpy())
-        with pytest.raises(ValueError, match="Unsupported dtype"):
-            _cp_iscopied_asarray(source.to_numpy(), orig_object=source)
-
-
-@pytest.mark.parametrize("data", DATA)
-@pytest.mark.parametrize("create_using", CREATE_USING)
-def test_from_pandas_edgelist(data, create_using):
-    df = pd.DataFrame(data)
-    nxcg.from_pandas_edgelist(df, create_using=create_using)  # Basic smoke test
-    source = df["source"]
-    if source.dtype == int:
-        is_copied, src_array = _cp_iscopied_asarray(source)
-        assert is_copied is True
-        is_copied, src_array = _cp_iscopied_asarray(source, orig_object=source)
-        assert is_copied is True
-        is_copied, src_array = _cp_iscopied_asarray(source.to_numpy())
-        assert is_copied is True
-        is_copied, src_array = _cp_iscopied_asarray(
-            source.to_numpy(), orig_object=source
-        )
-        assert is_copied is True
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py b/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py
deleted file mode 100644
index f3d0a8d3767..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_ego_graph.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import pytest
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-
-from .testing_utils import assert_graphs_equal
-
-if _nxver < (3, 2):
-    pytest.skip("Need NetworkX >=3.2 to test ego_graph", allow_module_level=True)
-
-
-@pytest.mark.parametrize(
-    "create_using", [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph]
-)
-@pytest.mark.parametrize("radius", [-1, 0, 1, 1.5, 2, float("inf"), None])
-@pytest.mark.parametrize("center", [True, False])
-@pytest.mark.parametrize("undirected", [False, True])
-@pytest.mark.parametrize("multiple_edges", [False, True])
-@pytest.mark.parametrize("n", [0, 3])
-def test_ego_graph_cycle_graph(
-    create_using, radius, center, undirected, multiple_edges, n
-):
-    Gnx = nx.cycle_graph(7, create_using=create_using)
-    if multiple_edges:
-        # Test multigraph with multiple edges
-        if not Gnx.is_multigraph():
-            return
-        Gnx.add_edges_from(nx.cycle_graph(7, create_using=nx.DiGraph).edges)
-        Gnx.add_edge(0, 1, 10)
-    Gcg = nxcg.from_networkx(Gnx, preserve_all_attrs=True)
-    assert_graphs_equal(Gnx, Gcg)  # Sanity check
-
-    kwargs = {"radius": radius, "center": center, "undirected": undirected}
-    Hnx = nx.ego_graph(Gnx, n, **kwargs)
-    Hcg = nx.ego_graph(Gnx, n, **kwargs, backend="cugraph")
-    use_compat_graphs = _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs
-    assert_graphs_equal(Hnx, Hcg._cudagraph if use_compat_graphs else Hcg)
-    Hcg = nx.ego_graph(Gcg, n, **kwargs)
-    assert_graphs_equal(Hnx, Hcg)
-    Hcg = nx.ego_graph(Gcg._to_compat_graph(), n, **kwargs)
-    assert_graphs_equal(Hnx, Hcg._cudagraph)
-    with pytest.raises(nx.NodeNotFound, match="not in G"):
-        nx.ego_graph(Gnx, -1, **kwargs)
-    with pytest.raises(nx.NodeNotFound, match="not in G"):
-        nx.ego_graph(Gnx, -1, **kwargs, backend="cugraph")
-    # Using sssp with default weight of 1 should give same answer as bfs
-    nx.set_edge_attributes(Gnx, 1, name="weight")
-    Gcg = nxcg.from_networkx(Gnx, preserve_all_attrs=True)
-    assert_graphs_equal(Gnx, Gcg)  # Sanity check
-
-    kwargs["distance"] = "weight"
-    H2nx = nx.ego_graph(Gnx, n, **kwargs)
-    is_nx32 = _nxver[:2] == (3, 2)
-    if undirected and Gnx.is_directed() and Gnx.is_multigraph():
-        if is_nx32:
-            # `should_run` was added in nx 3.3
-            match = "Weighted ego_graph with undirected=True not implemented"
-        elif _nxver >= (3, 4):
-            match = "not implemented by 'cugraph'"
-        else:
-            match = "not implemented by cugraph"
-        with pytest.raises(
-            RuntimeError if _nxver < (3, 4) else NotImplementedError, match=match
-        ):
-            nx.ego_graph(Gnx, n, **kwargs, backend="cugraph")
-        with pytest.raises(NotImplementedError, match="ego_graph"):
-            nx.ego_graph(Gcg, n, **kwargs, backend="cugraph")
-        if _nxver < (3, 4) or not nx.config.fallback_to_nx:
-            with pytest.raises(NotImplementedError, match="ego_graph"):
-                nx.ego_graph(Gcg, n, **kwargs)
-        else:
-            # This is an interesting case. `nxcg.ego_graph` is not implemented for
-            # these arguments, so it falls back to networkx. Hence, as it is currently
-            # implemented, the input graph is `nxcg.CudaGraph`, but the output graph
-            # is `nx.Graph`. Should networkx convert back to "cugraph" backend?
-            H2cg = nx.ego_graph(Gcg, n, **kwargs)
-            assert type(H2nx) is type(H2cg)
-            assert_graphs_equal(H2nx, nxcg.from_networkx(H2cg, preserve_all_attrs=True))
-    else:
-        H2cg = nx.ego_graph(Gnx, n, **kwargs, backend="cugraph")
-        assert_graphs_equal(H2nx, H2cg._cudagraph if use_compat_graphs else H2cg)
-        with pytest.raises(nx.NodeNotFound, match="not found in graph"):
-            nx.ego_graph(Gnx, -1, **kwargs)
-        with pytest.raises(nx.NodeNotFound, match="not found in graph"):
-            nx.ego_graph(Gnx, -1, **kwargs, backend="cugraph")
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_generators.py b/python/nx-cugraph/nx_cugraph/tests/test_generators.py
deleted file mode 100644
index 5c405f1c93b..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_generators.py
+++ /dev/null
@@ -1,282 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import numpy as np
-import pytest
-
-import nx_cugraph as nxcg
-from nx_cugraph import _nxver
-
-from .testing_utils import assert_graphs_equal
-
-if _nxver < (3, 2):
-    pytest.skip("Need NetworkX >=3.2 to test generators", allow_module_level=True)
-
-
-def compare(name, create_using, *args, is_vanilla=False):
-    exc1 = exc2 = None
-    func = getattr(nx, name)
-    if isinstance(create_using, nxcg.CudaGraph):
-        nx_create_using = nxcg.to_networkx(create_using)
-    elif isinstance(create_using, type) and issubclass(
-        create_using, (nxcg.Graph, nxcg.CudaGraph)
-    ):
-        nx_create_using = create_using.to_networkx_class()
-    elif isinstance(create_using, nx.Graph):
-        nx_create_using = create_using.copy()
-    else:
-        nx_create_using = create_using
-    try:
-        if is_vanilla:
-            G = func(*args)
-        else:
-            G = func(*args, create_using=nx_create_using)
-    except Exception as exc:
-        exc1 = exc
-    try:
-        if is_vanilla:
-            Gcg = func(*args, backend="cugraph")
-        else:
-            Gcg = func(*args, create_using=create_using, backend="cugraph")
-    except ZeroDivisionError:
-        raise
-    except NotImplementedError as exc:
-        if name in {"complete_multipartite_graph"}:  # nx.__version__[:3] <= "3.2"
-            return
-        exc2 = exc
-    except Exception as exc:
-        if exc1 is None:  # pragma: no cover (debug)
-            raise
-        exc2 = exc
-    if exc1 is not None or exc2 is not None:
-        assert type(exc1) is type(exc2)
-        return
-    if isinstance(Gcg, nxcg.Graph):
-        # If the graph is empty, it may be on host, otherwise it should be on device
-        if len(G):
-            assert Gcg._is_on_gpu
-            assert not Gcg._is_on_cpu
-        assert_graphs_equal(G, Gcg._cudagraph)
-    else:
-        assert_graphs_equal(G, Gcg)
-    # Ensure the output type is correct
-    if is_vanilla:
-        if _nxver < (3, 3) or nx.config.backends.cugraph.use_compat_graphs:
-            assert isinstance(Gcg, nxcg.Graph)
-        else:
-            assert isinstance(Gcg, nxcg.CudaGraph)
-    elif isinstance(create_using, type) and issubclass(
-        create_using, (nxcg.Graph, nxcg.CudaGraph)
-    ):
-        assert type(Gcg) is create_using
-    elif isinstance(create_using, (nxcg.Graph, nxcg.CudaGraph)):
-        assert type(Gcg) is type(create_using)
-
-
-N = list(range(-1, 5))
-CREATE_USING = [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph]
-COMPLETE_CREATE_USING = [
-    nx.Graph,
-    nx.DiGraph,
-    nx.MultiGraph,
-    nx.MultiDiGraph,
-    nxcg.Graph,
-    nxcg.DiGraph,
-    nxcg.MultiGraph,
-    nxcg.MultiDiGraph,
-    nxcg.CudaGraph,
-    nxcg.CudaDiGraph,
-    nxcg.CudaMultiGraph,
-    nxcg.CudaMultiDiGraph,
-    # These raise NotImplementedError
-    # nx.Graph(),
-    # nx.DiGraph(),
-    # nx.MultiGraph(),
-    # nx.MultiDiGraph(),
-    nxcg.Graph(),
-    nxcg.DiGraph(),
-    nxcg.MultiGraph(),
-    nxcg.MultiDiGraph(),
-    nxcg.CudaGraph(),
-    nxcg.CudaDiGraph(),
-    nxcg.CudaMultiGraph(),
-    nxcg.CudaMultiDiGraph(),
-    None,
-    object,  # Bad input
-    7,  # Bad input
-]
-GENERATORS_NOARG = [
-    # classic
-    "null_graph",
-    "trivial_graph",
-    # small
-    "bull_graph",
-    "chvatal_graph",
-    "cubical_graph",
-    "desargues_graph",
-    "diamond_graph",
-    "dodecahedral_graph",
-    "frucht_graph",
-    "heawood_graph",
-    "house_graph",
-    "house_x_graph",
-    "icosahedral_graph",
-    "krackhardt_kite_graph",
-    "moebius_kantor_graph",
-    "octahedral_graph",
-    "petersen_graph",
-    "sedgewick_maze_graph",
-    "tetrahedral_graph",
-    "truncated_cube_graph",
-    "truncated_tetrahedron_graph",
-    "tutte_graph",
-]
-GENERATORS_NOARG_VANILLA = [
-    # classic
-    "complete_multipartite_graph",
-    # small
-    "pappus_graph",
-    # social
-    "davis_southern_women_graph",
-    "florentine_families_graph",
-    "karate_club_graph",
-    "les_miserables_graph",
-]
-GENERATORS_N = [
-    # classic
-    "circular_ladder_graph",
-    "complete_graph",
-    "cycle_graph",
-    "empty_graph",
-    "ladder_graph",
-    "path_graph",
-    "star_graph",
-    "wheel_graph",
-]
-GENERATORS_M_N = [
-    # classic
-    "barbell_graph",
-    "lollipop_graph",
-    "tadpole_graph",
-    # bipartite
-    "complete_bipartite_graph",
-]
-GENERATORS_M_N_VANILLA = [
-    # classic
-    "complete_multipartite_graph",
-    "turan_graph",
-    # community
-    "caveman_graph",
-]
-
-
-@pytest.mark.parametrize("name", GENERATORS_NOARG)
-@pytest.mark.parametrize("create_using", COMPLETE_CREATE_USING)
-def test_generator_noarg(name, create_using):
-    print(name, create_using, type(create_using))
-    if isinstance(create_using, nxcg.CudaGraph) and name in {
-        # fmt: off
-        "bull_graph", "chvatal_graph", "cubical_graph", "diamond_graph",
-        "house_graph", "house_x_graph", "icosahedral_graph", "krackhardt_kite_graph",
-        "octahedral_graph", "petersen_graph", "truncated_cube_graph", "tutte_graph",
-        # fmt: on
-    }:
-        # The _raise_on_directed decorator used in networkx doesn't like our graphs.
-        if create_using.is_directed():
-            with pytest.raises(AssertionError):
-                compare(name, create_using)
-        else:
-            with pytest.raises(TypeError):
-                compare(name, create_using)
-    else:
-        compare(name, create_using)
-
-
-@pytest.mark.parametrize("name", GENERATORS_NOARG_VANILLA)
-def test_generator_noarg_vanilla(name):
-    print(name)
-    compare(name, None, is_vanilla=True)
-
-
-@pytest.mark.parametrize("name", GENERATORS_N)
-@pytest.mark.parametrize("n", N)
-@pytest.mark.parametrize("create_using", CREATE_USING)
-def test_generator_n(name, n, create_using):
-    print(name, n, create_using)
-    compare(name, create_using, n)
-
-
-@pytest.mark.parametrize("name", GENERATORS_N)
-@pytest.mark.parametrize("n", [1, 4])
-@pytest.mark.parametrize("create_using", COMPLETE_CREATE_USING)
-def test_generator_n_complete(name, n, create_using):
-    print(name, n, create_using)
-    compare(name, create_using, n)
-
-
-@pytest.mark.parametrize("name", GENERATORS_M_N)
-@pytest.mark.parametrize("create_using", CREATE_USING)
-@pytest.mark.parametrize("m", N)
-@pytest.mark.parametrize("n", N)
-def test_generator_m_n(name, create_using, m, n):
-    print(name, m, n, create_using)
-    compare(name, create_using, m, n)
-
-
-@pytest.mark.parametrize("name", GENERATORS_M_N_VANILLA)
-@pytest.mark.parametrize("m", N)
-@pytest.mark.parametrize("n", N)
-def test_generator_m_n_vanilla(name, m, n):
-    print(name, m, n)
-    compare(name, None, m, n, is_vanilla=True)
-
-
-@pytest.mark.parametrize("name", GENERATORS_M_N)
-@pytest.mark.parametrize("create_using", COMPLETE_CREATE_USING)
-@pytest.mark.parametrize("m", [4])
-@pytest.mark.parametrize("n", [4])
-def test_generator_m_n_complete(name, create_using, m, n):
-    print(name, m, n, create_using)
-    compare(name, create_using, m, n)
-
-
-@pytest.mark.parametrize("name", GENERATORS_M_N_VANILLA)
-@pytest.mark.parametrize("m", [4])
-@pytest.mark.parametrize("n", [4])
-def test_generator_m_n_complete_vanilla(name, m, n):
-    print(name, m, n)
-    compare(name, None, m, n, is_vanilla=True)
-
-
-def test_bad_lollipop_graph():
-    compare("lollipop_graph", None, [0, 1], [1, 2])
-
-
-def test_can_convert_karate_club():
-    # Karate club graph has string node values.
-    # This really tests conversions, but it's here so we can use `assert_graphs_equal`.
-    G = nx.karate_club_graph()
-    G.add_node(0, foo="bar")  # string dtype with a mask
-    G.add_node(1, object=object())  # haha
-    Gcg = nxcg.from_networkx(G, preserve_all_attrs=True)
-    assert_graphs_equal(G, Gcg)
-    Gnx = nxcg.to_networkx(Gcg)
-    assert nx.utils.graphs_equal(G, Gnx)
-    assert isinstance(Gcg.node_values["club"], np.ndarray)
-    assert Gcg.node_values["club"].dtype.kind == "U"
-    assert isinstance(Gcg.node_values["foo"], np.ndarray)
-    assert isinstance(Gcg.node_masks["foo"], np.ndarray)
-    assert Gcg.node_values["foo"].dtype.kind == "U"
-    assert isinstance(Gcg.node_values["object"], np.ndarray)
-    assert Gcg.node_values["object"].dtype.kind == "O"
-    assert isinstance(Gcg.node_masks["object"], np.ndarray)
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_graph_methods.py b/python/nx-cugraph/nx_cugraph/tests/test_graph_methods.py
deleted file mode 100644
index 40a361b1084..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_graph_methods.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import pytest
-
-import nx_cugraph as nxcg
-
-from .testing_utils import assert_graphs_equal
-
-
-def _create_Gs():
-    rv = []
-    rv.append(nx.DiGraph())
-    G = nx.DiGraph()
-    G.add_edge(0, 1)
-    G.add_edge(1, 0)
-    rv.append(G)
-    G = G.copy()
-    G.add_edge(0, 2)
-    rv.append(G)
-    G = G.copy()
-    G.add_edge(1, 1)
-    rv.append(G)
-    G = nx.DiGraph()
-    G.add_edge(0, 1, x=1, y=2)
-    G.add_edge(1, 0, x=10, z=3)
-    rv.append(G)
-    G = G.copy()
-    G.add_edge(0, 2, a=42)
-    rv.append(G)
-    G = G.copy()
-    G.add_edge(1, 1, a=4)
-    rv.append(G)
-    return rv
-
-
-@pytest.mark.parametrize("Gnx", _create_Gs())
-@pytest.mark.parametrize("reciprocal", [False, True])
-def test_to_undirected_directed(Gnx, reciprocal):
-    Gcg = nxcg.CudaDiGraph(Gnx)
-    assert_graphs_equal(Gnx, Gcg)
-    Hnx1 = Gnx.to_undirected(reciprocal=reciprocal)
-    Hcg1 = Gcg.to_undirected(reciprocal=reciprocal)
-    assert_graphs_equal(Hnx1, Hcg1)
-    Hnx2 = Hnx1.to_directed()
-    Hcg2 = Hcg1.to_directed()
-    assert_graphs_equal(Hnx2, Hcg2)
-
-
-def test_multidigraph_to_undirected():
-    Gnx = nx.MultiDiGraph()
-    Gnx.add_edge(0, 1)
-    Gnx.add_edge(0, 1)
-    Gnx.add_edge(1, 0)
-    Gcg = nxcg.CudaMultiDiGraph(Gnx)
-    with pytest.raises(NotImplementedError):
-        Gcg.to_undirected()
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_ktruss.py b/python/nx-cugraph/nx_cugraph/tests/test_ktruss.py
deleted file mode 100644
index 92fe2360688..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_ktruss.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import pytest
-
-import nx_cugraph as nxcg
-
-
-@pytest.mark.parametrize(
-    "get_graph", [nx.florentine_families_graph, nx.les_miserables_graph]
-)
-def test_k_truss(get_graph):
-    Gnx = get_graph()
-    Gcg = nxcg.from_networkx(Gnx, preserve_all_attrs=True)
-    for k in range(6):
-        Hnx = nx.k_truss(Gnx, k)
-        Hcg = nxcg.k_truss(Gcg, k)
-        assert nx.utils.graphs_equal(Hnx, nxcg.to_networkx(Hcg))
-        if Hnx.number_of_edges() == 0:
-            break
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_match_api.py b/python/nx-cugraph/nx_cugraph/tests/test_match_api.py
deleted file mode 100644
index 1a61c69b3e7..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_match_api.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import importlib
-import inspect
-
-import networkx as nx
-
-import nx_cugraph as nxcg
-from nx_cugraph.utils import networkx_algorithm
-
-
-def test_match_signature_and_names():
-    """Simple test to ensure our signatures and basic module layout match networkx."""
-    for name, func in vars(nxcg.interface.BackendInterface).items():
-        if not isinstance(func, networkx_algorithm):
-            continue
-
-        # nx version >=3.2 uses utils.backends, version >=3.0,<3.2 uses classes.backends
-        is_nx_30_or_31 = hasattr(nx.classes, "backends")
-        nx_backends = nx.classes.backends if is_nx_30_or_31 else nx.utils.backends
-
-        if is_nx_30_or_31 and name in {"louvain_communities"}:
-            continue
-        if name not in nx_backends._registered_algorithms:
-            print(f"{name} not dispatched from networkx")
-            continue
-        dispatchable_func = nx_backends._registered_algorithms[name]
-        # nx version >=3.2 uses orig_func, version >=3.0,<3.2 uses _orig_func
-        if is_nx_30_or_31:
-            orig_func = dispatchable_func._orig_func
-        else:
-            orig_func = dispatchable_func.orig_func
-
-        # Matching signatures?
-        orig_sig = inspect.signature(orig_func)
-        func_sig = inspect.signature(func)
-        if not func.extra_params:
-            assert orig_sig == func_sig, name
-        else:
-            # Ignore extra parameters added to nx-cugraph algorithm
-            # The key of func.extra_params may be like "max_level : int, optional",
-            # but we only want "max_level" here.
-            extra_params = {name.split(" ")[0] for name in func.extra_params}
-            assert orig_sig == func_sig.replace(
-                parameters=[
-                    p
-                    for name, p in func_sig.parameters.items()
-                    if name not in extra_params
-                ]
-            ), name
-        if func.can_run is not nxcg.utils.decorators._default_can_run:
-            assert func_sig == inspect.signature(func.can_run), name
-        if func.should_run is not nxcg.utils.decorators._default_should_run:
-            assert func_sig == inspect.signature(func.should_run), name
-
-        # Matching function names?
-        assert func.__name__ == dispatchable_func.__name__ == orig_func.__name__, name
-
-        # Matching dispatch names?
-        # nx version >=3.2 uses name, version >=3.0,<3.2 uses dispatchname
-        if is_nx_30_or_31:
-            dispatchname = dispatchable_func.dispatchname
-        else:
-            dispatchname = dispatchable_func.name
-        assert func.name == dispatchname, name
-
-        # Matching modules (i.e., where function defined)?
-        assert (
-            "networkx." + func.__module__.split(".", 1)[1]
-            == dispatchable_func.__module__
-            == orig_func.__module__
-        ), name
-
-        # Matching package layout (i.e., which modules have the function)?
-        nxcg_path = func.__module__
-        name = func.__name__
-        while "." in nxcg_path:
-            # This only walks up the module tree and does not check sibling modules
-            nxcg_path, mod_name = nxcg_path.rsplit(".", 1)
-            nx_path = nxcg_path.replace("nx_cugraph", "networkx")
-            nxcg_mod = importlib.import_module(nxcg_path)
-            nx_mod = importlib.import_module(nx_path)
-            # Is the function present in the current module?
-            present_in_nxcg = hasattr(nxcg_mod, name)
-            present_in_nx = hasattr(nx_mod, name)
-            if present_in_nxcg is not present_in_nx:  # pragma: no cover (debug)
-                if present_in_nxcg:
-                    raise AssertionError(
-                        f"{name} exists in {nxcg_path}, but not in {nx_path}"
-                    )
-                raise AssertionError(
-                    f"{name} exists in {nx_path}, but not in {nxcg_path}"
-                )
-            # Is the nested module present in the current module?
-            present_in_nxcg = hasattr(nxcg_mod, mod_name)
-            present_in_nx = hasattr(nx_mod, mod_name)
-            if present_in_nxcg is not present_in_nx:  # pragma: no cover (debug)
-                if present_in_nxcg:
-                    raise AssertionError(
-                        f"{mod_name} exists in {nxcg_path}, but not in {nx_path}"
-                    )
-                raise AssertionError(
-                    f"{mod_name} exists in {nx_path}, but not in {nxcg_path}"
-                )
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_multigraph.py b/python/nx-cugraph/nx_cugraph/tests/test_multigraph.py
deleted file mode 100644
index 9208eea09f2..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_multigraph.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import pytest
-
-import nx_cugraph as nxcg
-
-
-@pytest.mark.parametrize("test_nxcugraph", [False, True])
-def test_get_edge_data(test_nxcugraph):
-    G = nx.MultiGraph()
-    G.add_edge(0, 1, 0, x=10)
-    G.add_edge(0, 1, 1, y=20)
-    G.add_edge(0, 2, "a", x=100)
-    G.add_edge(0, 2, "b", y=200)
-    G.add_edge(0, 3)
-    G.add_edge(0, 3)
-    if test_nxcugraph:
-        G = nxcg.CudaMultiGraph(G)
-    default = object()
-    assert G.get_edge_data(0, 0, default=default) is default
-    assert G.get_edge_data("a", "b", default=default) is default
-    assert G.get_edge_data(0, 1, 2, default=default) is default
-    assert G.get_edge_data(-1, 1, default=default) is default
-    assert G.get_edge_data(0, 1, 0, default=default) == {"x": 10}
-    assert G.get_edge_data(0, 1, 1, default=default) == {"y": 20}
-    assert G.get_edge_data(0, 1, default=default) == {0: {"x": 10}, 1: {"y": 20}}
-    assert G.get_edge_data(0, 2, "a", default=default) == {"x": 100}
-    assert G.get_edge_data(0, 2, "b", default=default) == {"y": 200}
-    assert G.get_edge_data(0, 2, default=default) == {"a": {"x": 100}, "b": {"y": 200}}
-    assert G.get_edge_data(0, 3, 0, default=default) == {}
-    assert G.get_edge_data(0, 3, 1, default=default) == {}
-    assert G.get_edge_data(0, 3, 2, default=default) is default
-    assert G.get_edge_data(0, 3, default=default) == {0: {}, 1: {}}
-    assert G.has_edge(0, 1)
-    assert G.has_edge(0, 1, 0)
-    assert G.has_edge(0, 1, 1)
-    assert not G.has_edge(0, 1, 2)
-    assert not G.has_edge(0, 1, "a")
-    assert not G.has_edge(0, -1)
-    assert G.has_edge(0, 2)
-    assert G.has_edge(0, 2, "a")
-    assert G.has_edge(0, 2, "b")
-    assert not G.has_edge(0, 2, "c")
-    assert not G.has_edge(0, 2, 0)
-    assert G.has_edge(0, 3)
-    assert not G.has_edge(0, 0)
-    assert not G.has_edge(0, 0, 0)
-
-    G = nx.MultiGraph()
-    G.add_edge(0, 1)
-    if test_nxcugraph:
-        G = nxcg.CudaMultiGraph(G)
-    assert G.get_edge_data(0, 1, default=default) == {0: {}}
-    assert G.get_edge_data(0, 1, 0, default=default) == {}
-    assert G.get_edge_data(0, 1, 1, default=default) is default
-    assert G.get_edge_data(0, 1, "b", default=default) is default
-    assert G.get_edge_data(-1, 2, default=default) is default
-    assert G.has_edge(0, 1)
-    assert G.has_edge(0, 1, 0)
-    assert not G.has_edge(0, 1, 1)
-    assert not G.has_edge(0, 1, "a")
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_pagerank.py b/python/nx-cugraph/nx_cugraph/tests/test_pagerank.py
deleted file mode 100644
index 252f9e6bbb8..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_pagerank.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import pandas as pd
-import pytest
-
-
-def test_pagerank_multigraph():
-    """
-    Ensures correct pagerank for Graphs and MultiGraphs when using from_pandas_edgelist.
-
-    PageRank for MultiGraph should give different result compared to Graph; when using
-    a Graph, the duplicate edges should be dropped.
-    """
-    df = pd.DataFrame(
-        {"source": [0, 1, 1, 1, 1, 1, 1, 2], "target": [1, 2, 2, 2, 2, 2, 2, 3]}
-    )
-    expected_pr_for_G = nx.pagerank(nx.from_pandas_edgelist(df))
-    expected_pr_for_MultiG = nx.pagerank(
-        nx.from_pandas_edgelist(df, create_using=nx.MultiGraph)
-    )
-
-    G = nx.from_pandas_edgelist(df, backend="cugraph")
-    actual_pr_for_G = nx.pagerank(G, backend="cugraph")
-
-    MultiG = nx.from_pandas_edgelist(df, create_using=nx.MultiGraph, backend="cugraph")
-    actual_pr_for_MultiG = nx.pagerank(MultiG, backend="cugraph")
-
-    assert actual_pr_for_G == pytest.approx(expected_pr_for_G)
-    assert actual_pr_for_MultiG == pytest.approx(expected_pr_for_MultiG)
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_relabel.py b/python/nx-cugraph/nx_cugraph/tests/test_relabel.py
deleted file mode 100644
index 40bf851d376..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_relabel.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-import pytest
-
-import nx_cugraph as nxcg
-
-from .testing_utils import assert_graphs_equal
-
-
-@pytest.mark.parametrize(
-    "create_using", [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph]
-)
-def test_relabel(create_using):
-    G = nx.complete_graph(3, create_using=create_using)
-    Hnx = nx.relabel_nodes(G, {2: 1})
-    Hcg = nxcg.relabel_nodes(G, {2: 1})
-    assert_graphs_equal(Hnx, Hcg)
-
-    G.add_edge(0, 2, a=11)
-    G.add_edge(1, 2, b=22)
-    Hnx = nx.relabel_nodes(G, {2: 10, 1: 10})
-    Hcg = nxcg.relabel_nodes(G, {2: 10, 1: 10})
-    assert_graphs_equal(Hnx, Hcg)
-
-    G = nx.path_graph(3, create_using=create_using)
-    Hnx = nx.relabel_nodes(G, {2: 0})
-    Hcg = nxcg.relabel_nodes(G, {2: 0})
-    assert_graphs_equal(Hnx, Hcg)
-
-
-@pytest.mark.parametrize("create_using", [nx.MultiGraph, nx.MultiDiGraph])
-def test_relabel_multigraph(create_using):
-    G = nx.empty_graph(create_using=create_using)
-    G.add_edge(0, 1, "x", a=11)
-    G.add_edge(0, 2, "y", a=10, b=6)
-    G.add_edge(0, 0, c=7)
-    G.add_edge(0, 0, "x", a=-1, b=-1, c=-1)
-    Hnx = nx.relabel_nodes(G, {0: 1, 2: 1})
-    Hcg = nxcg.relabel_nodes(G, {0: 1, 2: 1})
-    assert_graphs_equal(Hnx, Hcg)
-    Hnx = nx.relabel_nodes(G, {2: 3, 1: 3, 0: 3})
-    Hcg = nxcg.relabel_nodes(G, {2: 3, 1: 3, 0: 3})
-    assert_graphs_equal(Hnx, Hcg)
-
-
-def test_relabel_nx_input():
-    G = nx.complete_graph(3)
-    with pytest.raises(RuntimeError, match="Using `copy=False` is invalid"):
-        nxcg.relabel_nodes(G, {0: 1}, copy=False)
-    Hnx = nx.relabel_nodes(G, {0: 1}, copy=True)
-    Hcg = nxcg.relabel_nodes(G, {0: 1}, copy=True)
-    assert_graphs_equal(Hnx, Hcg)
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_utils.py b/python/nx-cugraph/nx_cugraph/tests/test_utils.py
deleted file mode 100644
index d38a286fa5d..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_utils.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import cupy as cp
-import numpy as np
-import pytest
-
-from nx_cugraph.utils import _cp_iscopied_asarray, _get_int_dtype
-
-
-def test_get_int_dtype():
-    uint8 = np.dtype(np.uint8)
-    uint16 = np.dtype(np.uint16)
-    uint32 = np.dtype(np.uint32)
-    uint64 = np.dtype(np.uint64)
-    # signed
-    cur = np.iinfo(np.int8)
-    for val in [cur.min, cur.min + 1, -1, 0, 1, cur.max - 1, cur.max]:
-        assert _get_int_dtype(val) == np.int8
-        assert _get_int_dtype(val, signed=True) == np.int8
-        if val >= 0:
-            assert _get_int_dtype(val, unsigned=True) == np.uint8
-            assert _get_int_dtype(val + 1, unsigned=True) == np.uint8
-    prev = cur
-    cur = np.iinfo(np.int16)
-    for val in [cur.min, cur.min + 1, prev.min - 1, prev.max + 1, cur.max - 1, cur.max]:
-        assert _get_int_dtype(val) != prev.dtype
-        assert _get_int_dtype(val, signed=True) == np.int16
-        if val >= 0:
-            assert _get_int_dtype(val, unsigned=True) in {uint8, uint16}
-            assert _get_int_dtype(val + 1, unsigned=True) in {uint8, uint16}
-    prev = cur
-    cur = np.iinfo(np.int32)
-    for val in [cur.min, cur.min + 1, prev.min - 1, prev.max + 1, cur.max - 1, cur.max]:
-        assert _get_int_dtype(val) != prev.dtype
-        assert _get_int_dtype(val, signed=True) == np.int32
-        if val >= 0:
-            assert _get_int_dtype(val, unsigned=True) in {uint16, uint32}
-            assert _get_int_dtype(val + 1, unsigned=True) in {uint16, uint32}
-    prev = cur
-    cur = np.iinfo(np.int64)
-    for val in [cur.min, cur.min + 1, prev.min - 1, prev.max + 1, cur.max - 1, cur.max]:
-        assert _get_int_dtype(val) != prev.dtype
-        assert _get_int_dtype(val, signed=True) == np.int64
-        if val >= 0:
-            assert _get_int_dtype(val, unsigned=True) in {uint32, uint64}
-            assert _get_int_dtype(val + 1, unsigned=True) in {uint32, uint64}
-    with pytest.raises(ValueError, match="Value is too"):
-        _get_int_dtype(cur.min - 1, signed=True)
-    with pytest.raises(ValueError, match="Value is too"):
-        _get_int_dtype(cur.max + 1, signed=True)
-
-    # unsigned
-    cur = np.iinfo(np.uint8)
-    for val in [0, 1, cur.max - 1, cur.max]:
-        assert _get_int_dtype(val) == (np.uint8 if val > 1 else np.int8)
-        assert _get_int_dtype(val, unsigned=True) == np.uint8
-    assert _get_int_dtype(cur.max + 1) == np.int16
-    cur = np.iinfo(np.uint16)
-    for val in [cur.max - 1, cur.max]:
-        assert _get_int_dtype(val, unsigned=True) == np.uint16
-    assert _get_int_dtype(cur.max + 1) == np.int32
-    cur = np.iinfo(np.uint32)
-    for val in [cur.max - 1, cur.max]:
-        assert _get_int_dtype(val, unsigned=True) == np.uint32
-    assert _get_int_dtype(cur.max + 1) == np.int64
-    cur = np.iinfo(np.uint64)
-    for val in [cur.max - 1, cur.max]:
-        assert _get_int_dtype(val, unsigned=True) == np.uint64
-    with pytest.raises(ValueError, match="Value is incompatible"):
-        _get_int_dtype(cur.min - 1, unsigned=True)
-    with pytest.raises(ValueError, match="Value is too"):
-        _get_int_dtype(cur.max + 1, unsigned=True)
-
-    # API
-    with pytest.raises(TypeError, match="incompatible"):
-        _get_int_dtype(7, signed=True, unsigned=True)
-    assert _get_int_dtype(7, signed=True, unsigned=False) == np.int8
-    assert _get_int_dtype(7, signed=False, unsigned=True) == np.uint8
-
-
-def test_cp_iscopied_asarray():
-    # We don't yet run doctest, so do simple copy/paste test here.
-    #
-    # >>> is_copied, a = _cp_iscopied_asarray([1, 2, 3])
-    # >>> is_copied
-    # True
-    # >>> a
-    # array([1, 2, 3])
-    # >>> _cp_iscopied_asarray(a)
-    # (False, array([1, 2, 3]))
-    is_copied, a = _cp_iscopied_asarray([1, 2, 3])
-    assert is_copied is True
-    assert isinstance(a, cp.ndarray)
-    assert repr(a) == "array([1, 2, 3])"
-    assert _cp_iscopied_asarray(a)[0] is False
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_version.py b/python/nx-cugraph/nx_cugraph/tests/test_version.py
deleted file mode 100644
index c45702b6001..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/test_version.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-import nx_cugraph
-
-
-def test_version_constants_are_populated():
-    # __git_commit__ will only be non-empty in a built distribution
-    assert isinstance(nx_cugraph.__git_commit__, str)
-
-    # __version__ should always be non-empty
-    assert isinstance(nx_cugraph.__version__, str)
-    assert len(nx_cugraph.__version__) > 0
diff --git a/python/nx-cugraph/nx_cugraph/tests/testing_utils.py b/python/nx-cugraph/nx_cugraph/tests/testing_utils.py
deleted file mode 100644
index 50836acf55f..00000000000
--- a/python/nx-cugraph/nx_cugraph/tests/testing_utils.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import networkx as nx
-
-import nx_cugraph as nxcg
-
-
-def assert_graphs_equal(Gnx, Gcg):
-    assert isinstance(Gnx, nx.Graph)
-    assert isinstance(Gcg, nxcg.CudaGraph)
-    assert (a := Gnx.number_of_nodes()) == (b := Gcg.number_of_nodes()), (a, b)
-    assert (a := Gnx.number_of_edges()) == (b := Gcg.number_of_edges()), (a, b)
-    assert (a := Gnx.is_directed()) == (b := Gcg.is_directed()), (a, b)
-    assert (a := Gnx.is_multigraph()) == (b := Gcg.is_multigraph()), (a, b)
-    G = nxcg.to_networkx(Gcg)
-    rv = nx.utils.graphs_equal(G, Gnx)
-    if not rv:
-        print("GRAPHS ARE NOT EQUAL!")
-        assert sorted(G) == sorted(Gnx)
-        assert sorted(G._adj) == sorted(Gnx._adj)
-        assert sorted(G._node) == sorted(Gnx._node)
-        for k in sorted(G._adj):
-            print(k, sorted(G._adj[k]), sorted(Gnx._adj[k]))
-        print(nx.to_scipy_sparse_array(G).todense())
-        print(nx.to_scipy_sparse_array(Gnx).todense())
-        print(G.graph)
-        print(Gnx.graph)
-    assert rv
diff --git a/python/nx-cugraph/nx_cugraph/typing.py b/python/nx-cugraph/nx_cugraph/typing.py
deleted file mode 100644
index b419a9085e0..00000000000
--- a/python/nx-cugraph/nx_cugraph/typing.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-from collections.abc import Hashable
-from typing import TypeVar
-
-import cupy as cp
-import numpy as np
-
-AttrKey = TypeVar("AttrKey", bound=Hashable)
-EdgeKey = TypeVar("EdgeKey", bound=Hashable)
-NodeKey = TypeVar("NodeKey", bound=Hashable)
-EdgeTuple = tuple[NodeKey, NodeKey]
-EdgeValue = TypeVar("EdgeValue")
-NodeValue = TypeVar("NodeValue")
-IndexValue = TypeVar("IndexValue")
-Dtype = TypeVar("Dtype")
-
-
-class any_ndarray:
-    def __class_getitem__(cls, item):
-        return cp.ndarray[item] | np.ndarray[item]
diff --git a/python/nx-cugraph/nx_cugraph/utils/__init__.py b/python/nx-cugraph/nx_cugraph/utils/__init__.py
deleted file mode 100644
index 6df5fb60978..00000000000
--- a/python/nx-cugraph/nx_cugraph/utils/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .decorators import *
-from .misc import *
diff --git a/python/nx-cugraph/nx_cugraph/utils/decorators.py b/python/nx-cugraph/nx_cugraph/utils/decorators.py
deleted file mode 100644
index 16486996ba0..00000000000
--- a/python/nx-cugraph/nx_cugraph/utils/decorators.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-from functools import partial, update_wrapper
-from textwrap import dedent
-
-import networkx as nx
-from networkx import NetworkXError
-from networkx.utils.decorators import nodes_or_number, not_implemented_for
-
-from nx_cugraph import _nxver
-from nx_cugraph.interface import BackendInterface
-
-from .misc import _And_NotImplementedError
-
-try:
-    from networkx.utils.backends import _registered_algorithms
-except ModuleNotFoundError:
-    from networkx.classes.backends import _registered_algorithms
-
-
-__all__ = ["not_implemented_for", "nodes_or_number", "networkx_algorithm"]
-
-
-def networkx_class(api):
-    def inner(func):
-        func.__doc__ = getattr(api, func.__name__).__doc__
-        return func
-
-    return inner
-
-
-class networkx_algorithm:
-    name: str
-    extra_doc: str | None
-    extra_params: dict[str, str] | None
-    version_added: str
-    is_incomplete: bool
-    is_different: bool
-    _fallback: bool
-    _plc_names: set[str] | None
-
-    def __new__(
-        cls,
-        func=None,
-        *,
-        name: str | None = None,
-        # Extra parameter info that is added to NetworkX docstring
-        extra_params: dict[str, str] | str | None = None,
-        # Applies `nodes_or_number` decorator compatibly across versions (3.3 changed)
-        nodes_or_number: list[int] | int | None = None,
-        # Metadata (for introspection only)
-        version_added: str,  # Required
-        is_incomplete: bool = False,  # See self.extra_doc for details if True
-        is_different: bool = False,  # See self.extra_doc for details if True
-        fallback: bool = False,  # Change non-nx exceptions to NotImplementedError
-        _plc: str | set[str] | None = None,  # Hidden from user, may be removed someday
-    ):
-        if func is None:
-            return partial(
-                networkx_algorithm,
-                name=name,
-                extra_params=extra_params,
-                nodes_or_number=nodes_or_number,
-                version_added=version_added,
-                is_incomplete=is_incomplete,
-                is_different=is_different,
-                fallback=fallback,
-                _plc=_plc,
-            )
-        instance = object.__new__(cls)
-        if nodes_or_number is not None and _nxver > (3, 2):
-            func = nx.utils.decorators.nodes_or_number(nodes_or_number)(func)
-        # update_wrapper sets __wrapped__, which will be used for the signature
-        update_wrapper(instance, func)
-        instance.__defaults__ = func.__defaults__
-        instance.__kwdefaults__ = func.__kwdefaults__
-        instance.name = func.__name__ if name is None else name
-        if extra_params is None:
-            pass
-        elif isinstance(extra_params, str):
-            extra_params = {extra_params: ""}
-        elif not isinstance(extra_params, dict):
-            raise TypeError(
-                f"extra_params must be dict, str, or None; got {type(extra_params)}"
-            )
-        instance.extra_params = extra_params
-        if _plc is None or isinstance(_plc, set):
-            instance._plc_names = _plc
-        elif isinstance(_plc, str):
-            instance._plc_names = {_plc}
-        else:
-            raise TypeError(
-                f"_plc argument must be str, set, or None; got {type(_plc)}"
-            )
-        instance.version_added = version_added
-        instance.is_incomplete = is_incomplete
-        instance.is_different = is_different
-        instance.fallback = fallback
-        # The docstring on our function is added to the NetworkX docstring.
-        instance.extra_doc = (
-            dedent(func.__doc__.lstrip("\n").rstrip()) if func.__doc__ else None
-        )
-        # Copy __doc__ from NetworkX
-        if instance.name in _registered_algorithms:
-            instance.__doc__ = _registered_algorithms[instance.name].__doc__
-        instance.can_run = _default_can_run
-        instance.should_run = _default_should_run
-        setattr(BackendInterface, instance.name, instance)
-        # Set methods so they are in __dict__
-        instance._can_run = instance._can_run
-        instance._should_run = instance._should_run
-        if nodes_or_number is not None and _nxver <= (3, 2):
-            instance = nx.utils.decorators.nodes_or_number(nodes_or_number)(instance)
-        return instance
-
-    def _can_run(self, func):
-        """Set the `can_run` attribute to the decorated function."""
-        if not func.__name__.startswith("_"):
-            raise ValueError(
-                "The name of the function used by `_can_run` must begin with '_'; "
-                f"got: {func.__name__!r}"
-            )
-        self.can_run = func
-
-    def _should_run(self, func):
-        """Set the `should_run` attribute to the decorated function."""
-        if not func.__name__.startswith("_"):
-            raise ValueError(
-                "The name of the function used by `_should_run` must begin with '_'; "
-                f"got: {func.__name__!r}"
-            )
-        self.should_run = func
-
-    def __call__(self, /, *args, **kwargs):
-        if not self.fallback:
-            return self.__wrapped__(*args, **kwargs)
-        try:
-            return self.__wrapped__(*args, **kwargs)
-        except NetworkXError:
-            raise
-        except Exception as exc:
-            raise _And_NotImplementedError(exc) from exc
-
-    def __reduce__(self):
-        return _restore_networkx_dispatched, (self.name,)
-
-
-def _default_can_run(*args, **kwargs):
-    return True
-
-
-def _default_should_run(*args, **kwargs):
-    return True
-
-
-def _restore_networkx_dispatched(name):
-    return getattr(BackendInterface, name)
diff --git a/python/nx-cugraph/nx_cugraph/utils/misc.py b/python/nx-cugraph/nx_cugraph/utils/misc.py
deleted file mode 100644
index 01c25dd5983..00000000000
--- a/python/nx-cugraph/nx_cugraph/utils/misc.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-import itertools
-import operator as op
-import sys
-from random import Random
-from typing import TYPE_CHECKING, SupportsIndex
-
-import cupy as cp
-import numpy as np
-
-if TYPE_CHECKING:
-    import nx_cugraph as nxcg
-
-    from ..typing import Dtype, EdgeKey
-
-try:
-    from itertools import pairwise  # Python >=3.10
-except ImportError:
-
-    def pairwise(it):
-        it = iter(it)
-        for prev in it:
-            for cur in it:
-                yield (prev, cur)
-                prev = cur
-
-
-__all__ = [
-    "index_dtype",
-    "_groupby",
-    "_seed_to_int",
-    "_get_int_dtype",
-    "_get_float_dtype",
-    "_dtype_param",
-    "_cp_iscopied_asarray",
-]
-
-# This may switch to np.uint32 at some point
-index_dtype = np.int32
-
-# To add to `extra_params=` of `networkx_algorithm`
-_dtype_param = {
-    "dtype : dtype or None, optional": (
-        "The data type (np.float32, np.float64, or None) to use for the edge weights "
-        "in the algorithm. If None, then dtype is determined by the edge values."
-    ),
-}
-
-
-def _groupby(
-    groups: cp.ndarray | list[cp.ndarray],
-    values: cp.ndarray | list[cp.ndarray],
-    groups_are_canonical: bool = False,
-) -> dict[int, cp.ndarray]:
-    """Perform a groupby operation given an array of group IDs and array of values.
-
-    Parameters
-    ----------
-    groups : cp.ndarray or list of cp.ndarray
-        Array or list of arrays that holds the group IDs.
-    values : cp.ndarray or list of cp.ndarray
-        Array or list of arrays of values to be grouped according to groups.
-        Must be the same size as groups array.
-    groups_are_canonical : bool, default False
-        Whether the group IDs are consecutive integers beginning with 0.
-
-    Returns
-    -------
-    dict with group IDs as keys and cp.ndarray as values.
-    """
-    if isinstance(groups, list):
-        if groups_are_canonical:
-            raise ValueError(
-                "`groups_are_canonical=True` is not allowed when `groups` is a list."
-            )
-        if len(groups) == 0 or (size := groups[0].size) == 0:
-            return {}
-        sort_indices = cp.lexsort(cp.vstack(groups[::-1]))
-        sorted_groups = cp.vstack([group[sort_indices] for group in groups])
-        prepend = sorted_groups[:, 0].max() + 1
-        changed = cp.abs(cp.diff(sorted_groups, prepend=prepend)).sum(axis=0)
-        changed[0] = 1
-        left_bounds = cp.nonzero(changed)[0]
-    else:
-        if (size := groups.size) == 0:
-            return {}
-        sort_indices = cp.argsort(groups)
-        sorted_groups = groups[sort_indices]
-        prepend = 1 if groups_are_canonical else sorted_groups[0] + 1
-        left_bounds = cp.nonzero(cp.diff(sorted_groups, prepend=prepend))[0]
-    if isinstance(values, list):
-        sorted_values = [vals[sort_indices] for vals in values]
-    else:
-        sorted_values = values[sort_indices]
-    boundaries = pairwise(itertools.chain(left_bounds.tolist(), [size]))
-    if groups_are_canonical:
-        it = enumerate(boundaries)
-    elif isinstance(groups, list):
-        it = zip(map(tuple, sorted_groups.T[left_bounds].tolist()), boundaries)
-    else:
-        it = zip(sorted_groups[left_bounds].tolist(), boundaries)
-    if isinstance(values, list):
-        return {
-            group: [sorted_vals[start:end] for sorted_vals in sorted_values]
-            for group, (start, end) in it
-        }
-    return {group: sorted_values[start:end] for group, (start, end) in it}
-
-
-def _seed_to_int(seed: int | Random | None) -> int:
-    """Handle any valid seed argument and convert it to an int if necessary."""
-    if seed is None:
-        return
-    if isinstance(seed, Random):
-        return seed.randint(0, sys.maxsize)
-    return op.index(seed)  # Ensure seed is integral
-
-
-def _get_int_dtype(
-    val: SupportsIndex, *, signed: bool | None = None, unsigned: bool | None = None
-):
-    """Determine the smallest integer dtype that can store the integer ``val``.
-
-    If signed or unsigned are unspecified, then signed integers are preferred
-    unless the value can be represented by a smaller unsigned integer.
-
-    Raises
-    ------
-    ValueError : If the value cannot be represented with an int dtype.
-    """
-    # This is similar in spirit to `np.min_scalar_type`
-    if signed is not None:
-        if unsigned is not None and (not signed) is (not unsigned):
-            raise TypeError(
-                f"signed (={signed}) and unsigned (={unsigned}) keyword arguments "
-                "are incompatible."
-            )
-        signed = bool(signed)
-        unsigned = not signed
-    elif unsigned is not None:
-        unsigned = bool(unsigned)
-        signed = not unsigned
-
-    val = op.index(val)  # Ensure val is integral
-    if val < 0:
-        if unsigned:
-            raise ValueError(f"Value is incompatible with unsigned int: {val}.")
-        signed = True
-        unsigned = False
-
-    if signed is not False:
-        # Number of bytes (and a power of two)
-        signed_nbytes = (val + (val < 0)).bit_length() // 8 + 1
-        signed_nbytes = next(
-            filter(
-                signed_nbytes.__le__,
-                itertools.accumulate(itertools.repeat(2), op.mul, initial=1),
-            )
-        )
-    if unsigned is not False:
-        # Number of bytes (and a power of two)
-        unsigned_nbytes = (val.bit_length() + 7) // 8
-        unsigned_nbytes = next(
-            filter(
-                unsigned_nbytes.__le__,
-                itertools.accumulate(itertools.repeat(2), op.mul, initial=1),
-            )
-        )
-        if signed is None and unsigned is None:
-            # Prefer signed int if same size
-            signed = signed_nbytes <= unsigned_nbytes
-
-    if signed:
-        dtype_string = f"i{signed_nbytes}"
-    else:
-        dtype_string = f"u{unsigned_nbytes}"
-    try:
-        return np.dtype(dtype_string)
-    except TypeError as exc:
-        raise ValueError("Value is too large to store as integer: {val}") from exc
-
-
-def _get_float_dtype(
-    dtype: Dtype, *, graph: nxcg.CudaGraph | None = None, weight: EdgeKey | None = None
-):
-    """Promote dtype to float32 or float64 as appropriate."""
-    if dtype is None:
-        if graph is None or weight not in graph.edge_values:
-            return np.dtype(np.float32)
-        dtype = graph.edge_values[weight].dtype
-    rv = np.promote_types(dtype, np.float32)
-    if np.float32 != rv != np.float64:
-        raise TypeError(
-            f"Dtype {dtype} cannot be safely promoted to float32 or float64"
-        )
-    return rv
-
-
-def _cp_iscopied_asarray(a, *args, orig_object=None, **kwargs):
-    """Like ``cp.asarray``, but also returns whether the input was copied.
-
-    Use this to avoid unnecessary copies. If given, ``orig_object`` will
-    also be inspected to determine if it was copied.
-
-    >>> is_copied, a = _cp_iscopied_asarray([1, 2, 3])
-    >>> is_copied
-    True
-    >>> a
-    array([1, 2, 3])
-    >>> _cp_iscopied_asarray(a)
-    (False, array([1, 2, 3]))
-    """
-    arr = cp.asarray(a, *args, **kwargs)
-    ptr = arr.__cuda_array_interface__["data"][0]
-    if (
-        hasattr(a, "__cuda_array_interface__")
-        and a.__cuda_array_interface__["data"][0] == ptr
-        and (
-            orig_object is None
-            or hasattr(orig_object, "__cuda_array_interface__")
-            and orig_object.__cuda_array_interface__["data"][0] == ptr
-        )
-        # Should we also check device_id?
-        # and getattr(getattr(a, "data", None), "device_id", None) == arr.data.device_id
-    ):
-        return False, arr
-    return True, arr
-
-
-class _And_NotImplementedError(NotImplementedError):
-    """Additionally make an exception a ``NotImplementedError``.
-
-    For example:
-
-    >>> try:
-    ...     raise _And_NotImplementedError(KeyError("missing"))
-    ... except KeyError:
-    ...     pass
-
-    or
-
-    >>> try:
-    ...     raise _And_NotImplementedError(KeyError("missing"))
-    ... except NotImplementedError:
-    ...     pass
-
-    """
-
-    def __new__(cls, exc):
-        exc_type = type(exc)
-        if issubclass(exc_type, NotImplementedError):
-            new_type = exc_type
-        else:
-            new_type = type(
-                f"{exc_type.__name__}{cls.__name__}",
-                (exc_type, NotImplementedError),
-                {},
-            )
-        instance = NotImplementedError.__new__(new_type)
-        instance.__init__(*exc.args)
-        return instance
diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml
deleted file mode 100644
index 3acda782565..00000000000
--- a/python/nx-cugraph/pyproject.toml
+++ /dev/null
@@ -1,263 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-[build-system]
-
-requires = [
-    "rapids-build-backend>=0.3.1,<0.4.0.dev0",
-    "setuptools>=61.0.0",
-    "wheel",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-build-backend = "rapids_build_backend.build"
-
-[project]
-name = "nx-cugraph"
-dynamic = ["version"]
-description = "cugraph backend for NetworkX"
-readme = { file = "README.md", content-type = "text/markdown" }
-authors = [
-    { name = "NVIDIA Corporation" },
-]
-license = { text = "Apache 2.0" }
-requires-python = ">=3.10"
-classifiers = [
-    "Development Status :: 4 - Beta",
-    "License :: OSI Approved :: Apache Software License",
-    "Programming Language :: Python",
-    "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.10",
-    "Programming Language :: Python :: 3.11",
-    "Programming Language :: Python :: 3.12",
-    "Programming Language :: Python :: 3 :: Only",
-    "Intended Audience :: Developers",
-    "Topic :: Software Development :: Libraries :: Python Modules",
-]
-dependencies = [
-    "cupy-cuda11x>=12.0.0",
-    "networkx>=3.0",
-    "numpy>=1.23,<3.0a0",
-    "pylibcugraph==25.2.*,>=0.0.0a0",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-
-[project.optional-dependencies]
-test = [
-    "pandas",
-    "pytest",
-    "pytest-benchmark",
-    "pytest-cov",
-    "pytest-mpl",
-    "pytest-xdist",
-    "scipy",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-
-[project.urls]
-Homepage = "https://github.com/rapidsai/cugraph"
-Documentation = "https://docs.rapids.ai/api/cugraph/stable/"
-
-# "plugin" used in nx version < 3.2
-[project.entry-points."networkx.plugins"]
-cugraph = "nx_cugraph.interface:BackendInterface"
-
-[project.entry-points."networkx.plugin_info"]
-cugraph = "_nx_cugraph:get_info"
-
-# "backend" used in nx version >= 3.2
-[project.entry-points."networkx.backends"]
-cugraph = "nx_cugraph.interface:BackendInterface"
-
-[project.entry-points."networkx.backend_info"]
-cugraph = "_nx_cugraph:get_info"
-
-[tool.setuptools]
-license-files = ["LICENSE"]
-
-[tool.setuptools.dynamic]
-version = {file = "_nx_cugraph/VERSION"}
-
-[tool.setuptools.packages.find]
-include = [
-    "nx_cugraph*",
-    "nx_cugraph.*",
-    "_nx_cugraph*",
-    "_nx_cugraph.*",
-]
-
-[tool.rapids-build-backend]
-build-backend = "setuptools.build_meta"
-commit-files = ["_nx_cugraph/GIT_COMMIT"]
-dependencies-file = "../../dependencies.yaml"
-matrix-entry = "cuda_suffixed=true"
-
-[tool.black]
-line-length = 88
-target-version = ["py310", "py311", "py312"]
-
-[tool.isort]
-sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"]
-profile = "black"
-skip_gitignore = true
-float_to_top = true
-default_section = "THIRDPARTY"
-known_first_party = "nx_cugraph"
-line_length = 88
-extend_skip_glob = [
-    "nx_cugraph/__init__.py",
-    "nx_cugraph/classes/__init__.py",
-]
-
-[tool.pytest.ini_options]
-minversion = "6.0"
-testpaths = "nx_cugraph/tests"
-xfail_strict = true
-markers = [
-    "slow: Skipped unless --runslow passed",
-]
-log_cli_level = "info"
-filterwarnings = [
-    # See: https://docs.python.org/3/library/warnings.html#describing-warning-filters
-    # and: https://docs.pytest.org/en/7.2.x/how-to/capture-warnings.html#controlling-warnings
-    # "error",
-]
-python_files = [
-    "bench_*.py",
-    "test_*.py",
-]
-python_functions = [
-    "bench_*",
-    "test_*",
-]
-addopts = [
-    "--strict-config",  # Force error if config is mispelled
-    "--strict-markers",  # Force error if marker is mispelled (must be defined in config)
-    # "-ra",  # Print summary of all fails/errors
-    "--benchmark-warmup=off",
-    "--benchmark-max-time=0",
-    "--benchmark-min-rounds=1",
-    "--benchmark-columns=min,median,max",
-]
-
-[tool.coverage.run]
-branch = true
-source = ["nx_cugraph"]
-omit = []
-
-[tool.coverage.report]
-ignore_errors = false
-precision = 1
-fail_under = 0
-skip_covered = false  # Nice to see fully covered files when running `run_nx_tests.sh`
-skip_empty = true
-exclude_lines = [
-    "pragma: no cover",
-    "raise AssertionError",
-    "raise NotImplementedError",
-]
-
-[tool.ruff]
-# https://github.com/charliermarsh/ruff/
-line-length = 88
-target-version = "py310"
-[tool.ruff.lint]
-unfixable = [
-    "F841",  # unused-variable (Note: can leave useless expression)
-    "B905",  # zip-without-explicit-strict (Note: prefer `zip(x, y, strict=True)`)
-]
-select = [
-    "ALL",
-]
-external = [
-    # noqa codes that ruff doesn't know about: https://github.com/charliermarsh/ruff#external
-]
-ignore = [
-    # Would be nice to fix these
-    "B905",  # `zip()` without an explicit `strict=` parameter (Note: possible since py39 was dropped; we should do this!)
-    "D100",  # Missing docstring in public module
-    "D101",  # Missing docstring in public class
-    "D102",  # Missing docstring in public method
-    "D103",  # Missing docstring in public function
-    "D104",  # Missing docstring in public package
-    "D105",  # Missing docstring in magic method
-
-    # Maybe consider
-    # "SIM300",  # Yoda conditions are discouraged, use ... instead (Note: we're not this picky)
-    # "SIM401",  # Use dict.get ... instead of if-else-block (Note: if-else better for coverage and sometimes clearer)
-    # "TRY004",  # Prefer `TypeError` exception for invalid type (Note: good advice, but not worth the nuisance)
-    "B020",  # Found for loop that reassigns the iterable it is iterating with each iterable value (too strict)
-    "B904",  # Bare `raise` inside exception clause (like TRY200; sometimes okay)
-    "S310",  # Audit URL open for permitted schemes (Note: we don't download URLs in normal usage)
-
-    # Intentionally ignored
-    "A003",  # Class attribute ... is shadowing a python builtin
-    "ANN101",  # Missing type annotation for `self` in method
-    "ARG004",  # Unused static method argument: `...`
-    "COM812",  # Trailing comma missing
-    "D203",  # 1 blank line required before class docstring (Note: conflicts with D211, which is preferred)
-    "D400",  # First line should end with a period (Note: prefer D415, which also allows "?" and "!")
-    "F403",  # `from .classes import *` used; unable to detect undefined names (Note: used to match networkx)
-    "N801",  # Class name ... should use CapWords convention (Note:we have a few exceptions to this)
-    "N802",  # Function name ... should be lowercase
-    "N803",  # Argument name ... should be lowercase (Maybe okay--except in tests)
-    "N806",  # Variable ... in function should be lowercase
-    "N807",  # Function name should not start and end with `__`
-    "N818",  # Exception name ... should be named with an Error suffix (Note: good advice)
-    "PLR0911",  # Too many return statements
-    "PLR0912",  # Too many branches
-    "PLR0913",  # Too many arguments to function call
-    "PLR0915",  # Too many statements
-    "PLR2004",  # Magic number used in comparison, consider replacing magic with a constant variable
-    "PLW2901",  # Outer for loop variable ... overwritten by inner assignment target (Note: good advice, but too strict)
-    "RET502",  # Do not implicitly `return None` in function able to return non-`None` value
-    "RET503",  # Missing explicit `return` at the end of function able to return non-`None` value
-    "RET504",  # Unnecessary variable assignment before `return` statement
-    "RUF018",  # Avoid assignment expressions in `assert` statements
-    "S110",  # `try`-`except`-`pass` detected, consider logging the exception (Note: good advice, but we don't log)
-    "S112",  # `try`-`except`-`continue` detected, consider logging the exception (Note: good advice, but we don't log)
-    "SIM102",  # Use a single `if` statement instead of nested `if` statements (Note: often necessary)
-    "SIM105",  # Use contextlib.suppress(...) instead of try-except-pass (Note: try-except-pass is much faster)
-    "SIM108",  # Use ternary operator ... instead of if-else-block (Note: if-else better for coverage and sometimes clearer)
-    "TRY003",  # Avoid specifying long messages outside the exception class (Note: why?)
-    "UP038",  # Use `X | Y` in `isinstance` call instead of `(X, Y)` (Note: tuple is faster for now)
-
-    # Ignored categories
-    "C90",  # mccabe (Too strict, but maybe we should make things less complex)
-    "I",  # isort (Should we replace `isort` with this?)
-    "ANN",  # flake8-annotations
-    "BLE",  # flake8-blind-except (Maybe consider)
-    "FBT",  # flake8-boolean-trap (Why?)
-    "DJ",  # flake8-django (We don't use django)
-    "EM",  # flake8-errmsg (Perhaps nicer, but too much work)
-    # "ICN",  # flake8-import-conventions (Doesn't allow "_" prefix such as `_np`)
-    "PYI",  # flake8-pyi (We don't have stub files yet)
-    "SLF",  # flake8-self (We can use our own private variables--sheesh!)
-    "TID",  # flake8-tidy-imports (Rely on isort and our own judgement)
-    # "TCH",  # flake8-type-checking
-    "ARG",  # flake8-unused-arguments (Sometimes helpful, but too strict)
-    "TD",  # flake8-todos (Maybe okay to add some of these)
-    "FIX",  # flake8-fixme (like flake8-todos)
-    "ERA",  # eradicate (We like code in comments!)
-    "PD",  # pandas-vet (Intended for scripts that use pandas, not libraries)
-]
-
-[tool.ruff.lint.per-file-ignores]
-"__init__.py" = ["F401"]  # Allow unused imports (w/o defining `__all__`)
-# Allow assert, print, RNG, and no docstring
-"nx_cugraph/**/tests/*py" = ["S101", "S311", "T201", "D103", "D100"]
-"_nx_cugraph/__init__.py" = ["E501"]
-"nx_cugraph/__init__.py" = ["E402"]  # Allow module level import not at top of file
-"nx_cugraph/algorithms/**/*py" = ["D205", "D401"]  # Allow flexible docstrings for algorithms
-"nx_cugraph/generators/**/*py" = ["D205", "D401"]  # Allow flexible docstrings for generators
-"nx_cugraph/interface.py" = ["D401"]  # Flexible docstrings
-"nx_cugraph/convert.py" = ["E721"]  # Allow `dtype == object`
-"scripts/update_readme.py" = ["INP001"]  # Not part of a package
-
-[tool.ruff.lint.flake8-annotations]
-mypy-init-return = true
-
-[tool.ruff.lint.flake8-builtins]
-builtins-ignorelist = ["copyright"]
-
-[tool.ruff.lint.flake8-pytest-style]
-fixture-parentheses = false
-mark-parentheses = false
-
-[tool.ruff.lint.pydocstyle]
-convention = "numpy"
diff --git a/python/nx-cugraph/run_nx_tests.sh b/python/nx-cugraph/run_nx_tests.sh
deleted file mode 100755
index 5fb173cf939..00000000000
--- a/python/nx-cugraph/run_nx_tests.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/env bash
-#
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-#
-# NETWORKX_GRAPH_CONVERT=cugraph
-#   Used by networkx versions 3.0 and 3.1
-#   Must be set to "cugraph" to test the nx-cugraph backend.
-#
-# NETWORKX_TEST_BACKEND=cugraph
-#   Replaces NETWORKX_GRAPH_CONVERT for networkx versions >=3.2
-#   Must be set to "cugraph" to test the nx-cugraph backend.
-#
-# NETWORKX_FALLBACK_TO_NX=True (optional)
-#   Used by networkx versions >=3.2.  With this set, input graphs will not be
-#   converted to nx-cugraph and the networkx algorithm will be called for
-#   algorithms that we don't implement or if we raise NotImplementedError.
-#   This is sometimes helpful to get increased testing and coverage, but
-#   testing takes longer.  Without it, tests will xfail when encountering a
-#   function that we don't implement.
-#
-# NX_CUGRAPH_USE_COMPAT_GRAPHS, {"True", "False"}, default is "True"
-#   Whether to use `nxcg.Graph` as the nx_cugraph backend graph.
-#   A Graph should be a compatible NetworkX graph, so fewer tests should fail.
-#
-# Coverage of `nx_cugraph.algorithms` is reported and is a good sanity check
-# that algorithms run.
-
-# Warning: cugraph has a .coveragerc file in the <repo root>/python directory,
-# so be mindful of its contents and the CWD when running.
-# FIXME: should something be added to detect/prevent the above?
-set -e
-NETWORKX_GRAPH_CONVERT=cugraph \
-NETWORKX_TEST_BACKEND=cugraph \
-NETWORKX_FALLBACK_TO_NX=True \
-    pytest \
-    --pyargs networkx \
-    --config-file=$(dirname $0)/pyproject.toml \
-    --cov-config=$(dirname $0)/pyproject.toml \
-    --cov=nx_cugraph \
-    --cov-report= \
-    "$@"
-coverage report \
-    --include="*/nx_cugraph/algorithms/*" \
-    --omit=__init__.py \
-    --show-missing \
-    --rcfile=$(dirname $0)/pyproject.toml
diff --git a/python/nx-cugraph/scripts/update_readme.py b/python/nx-cugraph/scripts/update_readme.py
deleted file mode 100755
index 0dad5d67583..00000000000
--- a/python/nx-cugraph/scripts/update_readme.py
+++ /dev/null
@@ -1,254 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import re
-import urllib.request
-import zlib
-from collections import namedtuple
-from pathlib import Path
-from warnings import warn
-
-_objs_file_url = "https://networkx.org/documentation/stable/objects.inv"
-
-# See: https://sphobjinv.readthedocs.io/en/stable/syntax.html
-DocObject = namedtuple(
-    "DocObject",
-    "name, domain, role, priority, uri, displayname",
-)
-
-
-def parse_docobject(line):
-    left, right = line.split(":")
-    name, domain = left.rsplit(" ", 1)
-    role, priority, uri, displayname = right.split(" ", 3)
-    if displayname == "-":
-        displayname = name
-    if uri.endswith("$"):
-        uri = uri[:-1] + name
-    return DocObject(name, domain, role, priority, uri, displayname)
-
-
-def replace_body(text, match, new_body):
-    start, stop = match.span("body")
-    return text[:start] + new_body + text[stop:]
-
-
-# NetworkX isn't perfectly intersphinx-compatible, so manually specify some urls.
-# See: https://github.com/networkx/networkx/issues/7278
-MANUAL_OBJECT_URLS = {
-    "networkx.algorithms.centrality.betweenness": (
-        "https://networkx.org/documentation/stable/reference/"
-        "algorithms/centrality.html#shortest-path-betweenness"
-    ),
-    "networkx.algorithms.centrality.degree_alg": (
-        "https://networkx.org/documentation/stable/reference/"
-        "algorithms/centrality.html#degree"
-    ),
-    "networkx.algorithms.centrality.eigenvector": (
-        "https://networkx.org/documentation/stable/reference/"
-        "algorithms/centrality.html#eigenvector"
-    ),
-    "networkx.algorithms.centrality.katz": (
-        "https://networkx.org/documentation/stable/reference/"
-        "algorithms/centrality.html#eigenvector"
-    ),
-    "networkx.algorithms.components.connected": (
-        "https://networkx.org/documentation/stable/reference/"
-        "algorithms/component.html#connectivity"
-    ),
-    "networkx.algorithms.components.weakly_connected": (
-        "https://networkx.org/documentation/stable/reference/"
-        "algorithms/component.html#weak-connectivity"
-    ),
-    "networkx.classes": (
-        "https://networkx.org/documentation/stable/reference/classes/index.html"
-    ),
-}
-
-
-def main(readme_file, objects_filename):
-    """``readme_file`` must be readable and writable, so use mode ``"a+"``"""
-    from nx_cugraph.scripts.print_tree import create_tree, tree_lines
-
-    # Use the `objects.inv` file to determine URLs. For details about this file, see:
-    # https://sphobjinv.readthedocs.io/en/stable/syntax.html
-    # We might be better off using a library like that, but roll our own for now.
-    with Path(objects_filename).open("rb") as objects_file:
-        line = objects_file.readline()
-        if line != b"# Sphinx inventory version 2\n":
-            raise RuntimeError(f"Bad line in objects.inv:\n\n{line}")
-        line = objects_file.readline()
-        if line != b"# Project: NetworkX\n":
-            raise RuntimeError(f"Bad line in objects.inv:\n\n{line}")
-        line = objects_file.readline()
-        if not line.startswith(b"# Version: "):
-            raise RuntimeError(f"Bad line in objects.inv:\n\n{line}")
-        line = objects_file.readline()
-        if line != b"# The remainder of this file is compressed using zlib.\n":
-            raise RuntimeError(f"Bad line in objects.inv:\n\n{line}")
-        zlib_data = objects_file.read()
-    objects_text = zlib.decompress(zlib_data).decode().strip()
-    objects_list = [parse_docobject(line) for line in objects_text.split("\n")]
-    doc_urls = {
-        obj.name: "https://networkx.org/documentation/stable/" + obj.uri
-        for obj in objects_list
-    }
-    if len(objects_list) != len(doc_urls):
-        raise RuntimeError("Oops; duplicate names found in objects.inv")
-
-    def get_payload(info, **kwargs):
-        path = "networkx." + info.networkx_path
-        subpath, name = path.rsplit(".", 1)
-        # Many objects are referred to in modules above where they are defined.
-        while subpath:
-            path = f"{subpath}.{name}"
-            if path in doc_urls:
-                return f'<a href="{doc_urls[path]}">{name}</a>'
-            subpath = subpath.rsplit(".", 1)[0]
-        warn(f"Unable to find URL for {name!r}: {path}", stacklevel=0)
-        return name
-
-    def get_payload_internal(keys):
-        path = "networkx." + ".".join(keys)
-        name = keys[-1]
-        if path in doc_urls:
-            return f'<a href="{doc_urls[path]}">{name}</a>'
-        path2 = "reference/" + "/".join(keys)
-        if path2 in doc_urls:
-            return f'<a href="{doc_urls[path2]}">{name}</a>'
-        if path in MANUAL_OBJECT_URLS:
-            return f'<a href="{MANUAL_OBJECT_URLS[path]}">{name}</a>'
-        warn(f"Unable to find URL for {name!r}: {path}", stacklevel=0)
-        return name
-
-    readme_file.seek(0)
-    text = readme_file.read()
-    tree = create_tree(get_payload=get_payload)
-    # Algorithms
-    match = re.search(
-        r"### .Algorithms(?P<preamble>.*?)<pre>\n(?P<body>.*?)\n</pre>",
-        text,
-        re.DOTALL,
-    )
-    if not match:
-        raise RuntimeError("Algorithms section not found!")
-    lines = []
-    for key, val in tree["algorithms"].items():
-        lines.append(get_payload_internal(("algorithms", key)))
-        lines.extend(
-            tree_lines(
-                val,
-                parents=("algorithms", key),
-                get_payload_internal=get_payload_internal,
-            )
-        )
-    text = replace_body(text, match, "\n".join(lines))
-    # Generators
-    match = re.search(
-        r"### .Generators(?P<preamble>.*?)<pre>\n(?P<body>.*?)\n</pre>",
-        text,
-        re.DOTALL,
-    )
-    if not match:
-        raise RuntimeError("Generators section not found!")
-    lines = []
-    for key, val in tree["generators"].items():
-        lines.append(get_payload_internal(("generators", key)))
-        lines.extend(
-            tree_lines(
-                val,
-                parents=("generators", key),
-                get_payload_internal=get_payload_internal,
-            )
-        )
-    text = replace_body(text, match, "\n".join(lines))
-    # Other
-    match = re.search(
-        r"### Other\n(?P<preamble>.*?)<pre>\n(?P<body>.*?)\n</pre>",
-        text,
-        re.DOTALL,
-    )
-    if not match:
-        raise RuntimeError("Other section not found!")
-    lines = []
-    for key, val in tree.items():
-        if key in {"algorithms", "generators"}:
-            continue
-        lines.append(get_payload_internal((key,)))
-        lines.extend(
-            tree_lines(val, parents=(key,), get_payload_internal=get_payload_internal)
-        )
-    text = replace_body(text, match, "\n".join(lines))
-    # Now overwrite README.md
-    readme_file.truncate(0)
-    readme_file.write(text)
-    return text
-
-
-def find_or_download_objs_file(objs_file_dir):
-    """Return the path to <objs_file_dir>/objects.inv and download it if necessary.
-
-    Download objects.inv from _objs_file_url if it does not already exist.
-    """
-    objs_file_path = objs_file_dir / "objects.inv"
-    if not objs_file_path.exists():
-        request = urllib.request.Request(_objs_file_url)
-        with (
-            urllib.request.urlopen(request) as response,
-            Path(objs_file_path).open("wb") as out,
-        ):
-            out.write(response.read())
-    return objs_file_path
-
-
-if __name__ == "__main__":
-    # This script imports a nx_cugraph script module, which imports nx_cugraph
-    # runtime dependencies. The script module does not need the runtime deps,
-    # so stub them out to avoid installing them.
-    class Stub:
-        def __getattr__(self, *args, **kwargs):
-            return Stub()
-
-        def __call__(self, *args, **kwargs):
-            return Stub()
-
-    import sys
-
-    sys.modules["cupy"] = Stub()
-    sys.modules["numpy"] = Stub()
-    sys.modules["pylibcugraph"] = Stub()
-
-    parser = argparse.ArgumentParser(
-        "Update README.md to show NetworkX functions implemented by nx-cugraph"
-    )
-    parser.add_argument("readme_filename", help="Path to the README.md file")
-    parser.add_argument(
-        "networkx_objects",
-        nargs="?",
-        default=None,
-        help="Optional path to the objects.inv file from the NetworkX docs. Default is "
-        "the objects.inv file in the directory containing the specified README.md. If "
-        "an objects.inv file does not exist in that location, one will be downloaded "
-        "and saved to that location.",
-    )
-    args = parser.parse_args()
-
-    readme_filename = args.readme_filename
-    readme_path = Path(readme_filename)
-    objects_filename = args.networkx_objects
-    if objects_filename is None:
-        objects_filename = find_or_download_objs_file(readme_path.parent)
-
-    with readme_path.open("a+") as readme_file:
-        main(readme_file, objects_filename)
diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt
index 3a53c7d16c3..fe7c4b64aa5 100644
--- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt
+++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt
@@ -65,7 +65,12 @@ set(cython_sources
     all_pairs_sorensen_coefficients.pyx
     all_pairs_overlap_coefficients.pyx
     all_pairs_cosine_coefficients.pyx
+    heterogeneous_biased_neighbor_sample.pyx
+    heterogeneous_uniform_neighbor_sample.pyx
+    homogeneous_biased_neighbor_sample.pyx
+    homogeneous_uniform_neighbor_sample.pyx
     edge_id_lookup_table.pyx
+    decompress_to_edgelist.pyx
 )
 set(linked_libraries cugraph::cugraph;cugraph::cugraph_c)
 
diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py
index 9c04a528fd8..9047144c13a 100644
--- a/python/pylibcugraph/pylibcugraph/__init__.py
+++ b/python/pylibcugraph/pylibcugraph/__init__.py
@@ -43,6 +43,19 @@
 
 from pylibcugraph.biased_neighbor_sample import biased_neighbor_sample
 
+from pylibcugraph.homogeneous_uniform_neighbor_sample import (
+    homogeneous_uniform_neighbor_sample,
+)
+from pylibcugraph.homogeneous_biased_neighbor_sample import (
+    homogeneous_biased_neighbor_sample,
+)
+from pylibcugraph.heterogeneous_uniform_neighbor_sample import (
+    heterogeneous_uniform_neighbor_sample,
+)
+from pylibcugraph.heterogeneous_biased_neighbor_sample import (
+    heterogeneous_biased_neighbor_sample,
+)
+
 from pylibcugraph.negative_sampling import negative_sampling
 
 from pylibcugraph.core_number import core_number
@@ -113,6 +126,8 @@
 
 from pylibcugraph.degrees import in_degrees, out_degrees, degrees
 
+from pylibcugraph.decompress_to_edgelist import decompress_to_edgelist
+
 
 from pylibcugraph import exceptions
 
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
index 6d5d5a23cca..21f5190ad5f 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
@@ -178,6 +178,16 @@ cdef extern from "cugraph_c/algorithms.h":
             const cugraph_sample_result_t* result
         )
 
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_sample_result_get_edge_renumber_map(
+            const cugraph_sample_result_t* result
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_sample_result_get_edge_renumber_map_offsets(
+            const cugraph_sample_result_t* result
+        )
+
     # Deprecated, use cugraph_sample_result_get_majors
     cdef cugraph_type_erased_device_array_view_t* \
         cugraph_sample_result_get_sources(
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd
index b8f16cb94c8..b27a7230a13 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd
@@ -122,41 +122,41 @@ cdef extern from "cugraph_c/graph_functions.h":
 
     ###########################################################################
     # induced_subgraph
-    ctypedef struct cugraph_induced_subgraph_result_t:
+    ctypedef struct cugraph_induced_subgraph_result_t: # Deprecated
         pass
 
     cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_induced_subgraph_get_sources(
+        cugraph_induced_subgraph_get_sources( # Deprecated
             cugraph_induced_subgraph_result_t* induced_subgraph
         )
 
     cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_induced_subgraph_get_destinations(
+        cugraph_induced_subgraph_get_destinations( # Deprecated
             cugraph_induced_subgraph_result_t* induced_subgraph
         )
 
     cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_induced_subgraph_get_edge_weights(
+        cugraph_induced_subgraph_get_edge_weights( # Deprecated
             cugraph_induced_subgraph_result_t* induced_subgraph
         )
 
     cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_induced_subgraph_get_edge_ids(
+        cugraph_induced_subgraph_get_edge_ids( # Deprecated
             cugraph_induced_subgraph_result_t* induced_subgraph
         )
 
     cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_induced_subgraph_get_edge_type_ids(
+        cugraph_induced_subgraph_get_edge_type_ids( # Deprecated
             cugraph_induced_subgraph_result_t* induced_subgraph
         )
 
     cdef cugraph_type_erased_device_array_view_t* \
-        cugraph_induced_subgraph_get_subgraph_offsets(
+        cugraph_induced_subgraph_get_subgraph_offsets( # Deprecated
             cugraph_induced_subgraph_result_t* induced_subgraph
         )
 
     cdef void \
-        cugraph_induced_subgraph_result_free(
+        cugraph_induced_subgraph_result_free( # Deprecated
             cugraph_induced_subgraph_result_t* induced_subgraph
         )
 
@@ -250,3 +250,52 @@ cdef extern from "cugraph_c/graph_functions.h":
         cugraph_degrees_result_free(
             cugraph_degrees_result_t* degrees_result
         )
+
+    ###########################################################################
+    # decompress to edgelist
+    ctypedef struct cugraph_edgelist_t:
+        pass
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_edgelist_get_sources(
+            cugraph_edgelist_t* edgelist
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_edgelist_get_destinations(
+            cugraph_edgelist_t* edgelist
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_edgelist_get_edge_weights(
+            cugraph_edgelist_t* edgelist
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_edgelist_get_edge_ids(
+            cugraph_edgelist_t* edgelist
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_edgelist_get_edge_type_ids(
+            cugraph_edgelist_t* edgelist
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_edgelist_get_edge_offsets(
+            cugraph_edgelist_t* edgelist
+        )
+
+    cdef void \
+        cugraph_edgelist_free(
+            cugraph_edgelist_t* edgelist
+        )
+
+    cdef cugraph_error_code_t \
+        cugraph_decompress_to_edgelist(
+            const cugraph_resource_handle_t* handle,
+            cugraph_graph_t* graph,
+            bool_t do_expensive_check,
+            cugraph_edgelist_t** result,
+            cugraph_error_t** error
+        )
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd
index 3f7b8b9ae29..762fd37a35d 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd
@@ -67,6 +67,62 @@ cdef extern from "cugraph_c/sampling_algorithms.h":
         cugraph_error_t** error
     )
 
+    cdef cugraph_error_code_t cugraph_heterogeneous_uniform_neighbor_sample(
+        const cugraph_resource_handle_t* handle,
+        cugraph_rng_state_t* rng_state,
+        cugraph_graph_t* graph,
+        const cugraph_type_erased_device_array_view_t* start_vertices,
+        const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+        const cugraph_type_erased_host_array_view_t* fan_out,
+        int num_edge_types,
+        const cugraph_sampling_options_t* options,
+        bool_t do_expensive_check,
+        cugraph_sample_result_t** result,
+        cugraph_error_t** error
+    )
+
+    cdef cugraph_error_code_t cugraph_heterogeneous_biased_neighbor_sample(
+        const cugraph_resource_handle_t* handle,
+        cugraph_rng_state_t* rng_state,
+        cugraph_graph_t* graph,
+        const cugraph_edge_property_view_t* edge_biases,
+        const cugraph_type_erased_device_array_view_t* start_vertices,
+        const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+        const cugraph_type_erased_host_array_view_t* fan_out,
+        int num_edge_types,
+        const cugraph_sampling_options_t* options,
+        bool_t do_expensive_check,
+        cugraph_sample_result_t** result,
+        cugraph_error_t** error
+    )
+
+    cdef cugraph_error_code_t cugraph_homogeneous_uniform_neighbor_sample(
+        const cugraph_resource_handle_t* handle,
+        cugraph_rng_state_t* rng_state,
+        cugraph_graph_t* graph,
+        const cugraph_type_erased_device_array_view_t* start_vertices,
+        const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+        const cugraph_type_erased_host_array_view_t* fan_out,
+        const cugraph_sampling_options_t* options,
+        bool_t do_expensive_check,
+        cugraph_sample_result_t** result,
+        cugraph_error_t** error
+    )
+
+    cdef cugraph_error_code_t cugraph_homogeneous_biased_neighbor_sample(
+        const cugraph_resource_handle_t* handle,
+        cugraph_rng_state_t* rng_state,
+        cugraph_graph_t* graph,
+        const cugraph_edge_property_view_t* edge_biases,
+        const cugraph_type_erased_device_array_view_t* start_vertices,
+        const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+        const cugraph_type_erased_host_array_view_t* fan_out,
+        const cugraph_sampling_options_t* options,
+        bool_t do_expensive_check,
+        cugraph_sample_result_t** result,
+        cugraph_error_t** error
+    )
+
     cdef cugraph_error_code_t cugraph_biased_neighbor_sample(
         const cugraph_resource_handle_t* handle,
         cugraph_graph_t* graph,
diff --git a/python/pylibcugraph/pylibcugraph/decompress_to_edgelist.pyx b/python/pylibcugraph/pylibcugraph/decompress_to_edgelist.pyx
new file mode 100644
index 00000000000..58c29940aba
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/decompress_to_edgelist.pyx
@@ -0,0 +1,169 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+
+from pylibcugraph._cugraph_c.types cimport (
+    bool_t,
+)
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.graph_functions cimport (
+    cugraph_edgelist_t,
+    cugraph_decompress_to_edgelist,
+    cugraph_edgelist_get_sources,
+    cugraph_edgelist_get_destinations,
+    cugraph_edgelist_get_edge_weights,
+    cugraph_edgelist_get_edge_ids,
+    cugraph_edgelist_get_edge_type_ids,
+    cugraph_edgelist_get_edge_offsets,
+    cugraph_edgelist_free,
+)
+
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    copy_to_cupy_array,
+    create_cugraph_type_erased_device_array_view_from_py_obj,
+)
+
+
+def decompress_to_edgelist(ResourceHandle resource_handle,
+                           _GPUGraph graph,
+                           bool_t do_expensive_check):
+    """
+    Extract a the edgelist from a graph
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph.
+
+    do_expensive_check : bool_t
+        If True, performs more extensive tests on the inputs to ensure
+        validitity, at the expense of increased run time.
+
+    Returns
+    -------
+    A tuple of device arrays containing the sources, destinations and if applicable
+    edge_weights, edge_ids and/or edge_type_ids.
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 1, 2, 2, 2, 3, 4], dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 3, 4, 0, 1, 3, 5, 5], dtype=numpy.int32)
+    >>> weights = cupy.asarray(
+    ...     [0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2], dtype=numpy.float32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=False, renumber=False, do_expensive_check=False)
+    >>> (sources, destinations, edge_weights, _, _) =
+    ...     pylibcugraph.decompress_to_edgelist(
+    ...         resource_handle, G, False)
+    >>> sources
+    [0, 1, 1, 2, 2, 2, 3, 4]
+    >>> destinations
+    [1, 3, 4, 0, 1, 3, 5, 5]
+    >>> edge_weights
+    [0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2]
+    """
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+    cdef cugraph_edgelist_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    error_code = cugraph_decompress_to_edgelist(c_resource_handle_ptr,
+                                                c_graph_ptr,
+                                                do_expensive_check,
+                                                &result_ptr,
+                                                &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_decompress_to_edgelist")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+    cdef cugraph_type_erased_device_array_view_t* sources_ptr = \
+        cugraph_edgelist_get_sources(result_ptr)
+    cdef cugraph_type_erased_device_array_view_t* destinations_ptr = \
+        cugraph_edgelist_get_destinations(result_ptr)
+    cdef cugraph_type_erased_device_array_view_t* edge_weights_ptr = \
+        cugraph_edgelist_get_edge_weights(result_ptr)
+
+    cdef cugraph_type_erased_device_array_view_t* edge_ids_ptr = \
+        cugraph_edgelist_get_edge_ids(result_ptr)
+    cdef cugraph_type_erased_device_array_view_t* edge_type_ids_ptr = \
+        cugraph_edgelist_get_edge_type_ids(result_ptr)
+
+
+    """
+    cdef cugraph_type_erased_device_array_view_t* subgraph_offsets_ptr = \
+        cugraph_edgelist_get_edge_offsets(result_ptr)
+    """
+
+    # FIXME: Get ownership of the result data instead of performing a copy
+    # for perfomance improvement
+    cupy_edge_weights = None
+    cupy_edge_ids = None
+    cupy_edge_type_ids = None
+    cupy_sources = copy_to_cupy_array(
+        c_resource_handle_ptr, sources_ptr)
+    cupy_destinations = copy_to_cupy_array(
+        c_resource_handle_ptr, destinations_ptr)
+    if edge_weights_ptr != NULL:
+        cupy_edge_weights = copy_to_cupy_array(
+            c_resource_handle_ptr, edge_weights_ptr)
+    if edge_ids_ptr != NULL:
+        cupy_edge_ids = copy_to_cupy_array(
+            c_resource_handle_ptr, edge_ids_ptr)
+    if edge_type_ids_ptr != NULL:
+        cupy_edge_type_ids = copy_to_cupy_array(
+            c_resource_handle_ptr, edge_type_ids_ptr)
+
+    """
+    cupy_subgraph_offsets = copy_to_cupy_array(
+        c_resource_handle_ptr, subgraph_offsets_ptr)
+    """
+
+    # Free pointer
+    cugraph_edgelist_free(result_ptr)
+
+    return (cupy_sources, cupy_destinations,
+                cupy_edge_weights, cupy_edge_ids, cupy_edge_type_ids)
diff --git a/python/pylibcugraph/pylibcugraph/heterogeneous_biased_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/heterogeneous_biased_neighbor_sample.pyx
new file mode 100644
index 00000000000..ecdfba3afc5
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/heterogeneous_biased_neighbor_sample.pyx
@@ -0,0 +1,428 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from pylibcugraph._cugraph_c.types cimport (
+    bool_t,
+    SIZE_T
+)
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.properties cimport (
+    cugraph_edge_property_view_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_create,
+    cugraph_type_erased_device_array_view_free,
+    cugraph_type_erased_host_array_view_t,
+    cugraph_type_erased_host_array_view_create,
+    cugraph_type_erased_host_array_view_free,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.algorithms cimport (
+    cugraph_sample_result_t,
+    cugraph_prior_sources_behavior_t,
+    cugraph_compression_type_t,
+    cugraph_sampling_options_t,
+    cugraph_sampling_options_create,
+    cugraph_sampling_options_free,
+    cugraph_sampling_set_with_replacement,
+    cugraph_sampling_set_return_hops,
+    cugraph_sampling_set_prior_sources_behavior,
+    cugraph_sampling_set_dedupe_sources,
+    cugraph_sampling_set_renumber_results,
+    cugraph_sampling_set_compress_per_hop,
+    cugraph_sampling_set_compression_type,
+    cugraph_sampling_set_retain_seeds,
+)
+from pylibcugraph._cugraph_c.sampling_algorithms cimport (
+    cugraph_heterogeneous_biased_neighbor_sample,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    assert_CAI_type,
+    assert_AI_type,
+    get_c_type_from_numpy_type,
+)
+from pylibcugraph.internal_types.sampling_result cimport (
+    SamplingResult,
+)
+from pylibcugraph._cugraph_c.random cimport (
+    cugraph_rng_state_t
+)
+from pylibcugraph.random cimport (
+    CuGraphRandomState
+)
+import warnings
+
+# TODO accept cupy/numpy random state in addition to raw seed.
+def heterogeneous_biased_neighbor_sample(ResourceHandle resource_handle,
+                                         _GPUGraph input_graph,
+                                         start_vertex_list,
+                                         starting_vertex_label_offsets,
+                                         h_fan_out,
+                                         num_edge_types,
+                                         bool_t with_replacement,
+                                         bool_t do_expensive_check,
+                                         prior_sources_behavior=None,
+                                         deduplicate_sources=False,
+                                         return_hops=False,
+                                         renumber=False,
+                                         retain_seeds=False,
+                                         compression='COO',
+                                         compress_per_hop=False,
+                                         random_state=None):
+    """
+    Performs biased neighborhood sampling, which samples nodes from
+    a graph based on the current node's neighbors, with a corresponding fan_out
+    value at each hop. The edges are sampled with biases. Heterogeneous
+    neighborhood sampling translates to more than 1 edge types.
+
+    Parameters
+    ----------
+    resource_handle: ResourceHandle
+        Handle to the underlying device and host resources needed for
+        referencing data and running algorithms.
+
+    input_graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    edge_biases: not supported.
+
+    start_vertex_list: device array type
+        Device array containing the list of starting vertices for sampling.
+
+    starting_vertex_label_offsets: device array type (Optional)
+        Offsets of each label within the start vertex list. Expanding
+        'starting_vertex_label_offsets' must lead to an array of
+        len(start_vertex_list)
+
+    h_fan_out: numpy array type
+        Device array containing the branching out (fan-out) degrees per
+        starting vertex for each hop level. The fanout value at each hop for each
+        edge type is given by the relationship
+        h_fanout[x*num_edge_types + edge_type_id]
+
+        The sampling method can use different fan_out values for each edge type
+        which is not the case for homogeneous neighborhood sampling (both biased
+        and uniform).
+
+    num_edge_types: int
+        Number of edge types where a value of 1 translates to homogeneous neighbor
+        sample whereas a value greater than 1 translates to heterogeneous neighbor
+        sample.
+
+    with_replacement: bool
+        If true, sampling procedure is done with replacement (the same vertex
+        can be selected multiple times in the same step).
+
+    do_expensive_check: bool
+        If True, performs more extensive tests on the inputs to ensure
+        validitity, at the expense of increased run time.
+
+    prior_sources_behavior: str (Optional)
+        Options are "carryover", and "exclude".
+        Default will leave the source list as-is.
+        Carryover will carry over sources from previous hops to the
+        current hop.
+        Exclude will exclude sources from previous hops from reappearing
+        as sources in future hops.
+
+    deduplicate_sources: bool (Optional)
+        If True, will deduplicate the source list before sampling.
+        Defaults to False.
+
+    renumber: bool (Optional)
+        If True, will renumber the sources and destinations on a
+        per-batch basis and return the renumber map and batch offsets
+        in additional to the standard returns.
+
+    retain_seeds: bool (Optional)
+        If True, will retain the original seeds (original source vertices)
+        in the output even if they do not have outgoing neighbors.
+        Defaults to False.
+
+    compression: str (Optional)
+        Options: COO (default), CSR, CSC, DCSR, DCSR
+        Sets the compression format for the returned samples.
+
+    compress_per_hop: bool (Optional)
+        If False (default), will create a compressed edgelist for the
+        entire batch.
+        If True, will create a separate compressed edgelist per hop within
+        a batch.
+
+    random_state: int (Optional)
+        Random state to use when generating samples.  Optional argument,
+        defaults to a hash of process id, time, and hostname.
+        (See pylibcugraph.random.CuGraphRandomState)
+
+    Returns
+    -------
+    A tuple of device arrays, where the first and second items in the tuple
+    are device arrays containing the starting and ending vertices of each
+    walk respectively, the third item in the tuple is a device array
+    containing the start labels, and the fourth item in the tuple is a device
+    array containing the indices for reconstructing paths.
+
+    If renumber was set to True, then the fifth item in the tuple is a device
+    array containing the renumber map, and the sixth item in the tuple is a
+    device array containing the renumber map offsets (which delineate where
+    the renumber map for each batch starts).
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5],
+    ...                     dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4],
+    ...                     dtype=numpy.int32)
+    >>> weights = cupy.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 2.1,
+    ...                         1.1, 5.1, 3.1,  4.1, 7.2, 3.2], dtype=numpy.float32)
+    >>> edge_types = cupy.asarray([0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1],
+    ...                            dtype=numpy.int32)
+    >>> start_vertices = cupy.asarray([2, 5, 1]).astype(numpy.int32)
+    >>> num_edge_types = 2
+    >>> starting_vertex_label_offsets = cupy.asarray([0, 2, 3])
+    >>> h_fan_out = numpy.array([2]).astype(numpy.int32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=False, renumber=False, do_expensive_check=False)
+    >>> sampling_results = pylibcugraph.heterogeneous_biased_neighbor_sample(
+    ...         resource_handle, G, start_vertices, starting_vertex_label_offsets,
+    ...         h_fan_out, num_edge_types, False, True)
+    >>> sampling_results
+    {'majors': array([2, 2, 2, 5, 5, 1, 1, 1, 1], dtype=int32),
+     'minors': array([0, 1, 3, 3, 4, 0, 2, 3, 4], dtype=int32),
+     'weight': array([5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 3.1, 2.1, 1.1], dtype=float32)}
+
+    """
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = (
+        resource_handle.c_resource_handle_ptr
+    )
+
+    cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr
+    cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = <cugraph_type_erased_host_array_view_t*>NULL
+
+    cdef bool_t c_deduplicate_sources = deduplicate_sources
+    cdef bool_t c_return_hops = return_hops
+    cdef bool_t c_renumber = renumber
+    cdef bool_t c_compress_per_hop = compress_per_hop
+
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+    cdef uintptr_t ai_fan_out_ptr
+
+    # FIXME: refactor the way we are creating pointer. Can use a single helper function to create
+
+    assert_CAI_type(start_vertex_list, "start_vertex_list")
+    assert_CAI_type(starting_vertex_label_offsets, "starting_vertex_label_offsets", True)
+
+    assert_AI_type(h_fan_out, "h_fan_out")
+
+    if starting_vertex_label_offsets is not None:
+        if starting_vertex_label_offsets[-1] != len(start_vertex_list):
+            raise ValueError(
+                "'starting_vertex_label_offsets' and 'start_vertex_list' must be proportional")
+
+
+    ai_fan_out_ptr = \
+        h_fan_out.__array_interface__["data"][0]
+
+    fan_out_ptr = \
+        cugraph_type_erased_host_array_view_create(
+            <void*>ai_fan_out_ptr,
+            len(h_fan_out),
+            get_c_type_from_numpy_type(h_fan_out.dtype))
+
+
+
+    cdef cugraph_sample_result_t* result_ptr
+
+    cdef uintptr_t cai_start_ptr = \
+        start_vertex_list.__cuda_array_interface__["data"][0]
+
+    cdef uintptr_t cai_starting_vertex_label_offsets_ptr
+    if starting_vertex_label_offsets is not None:
+        cai_starting_vertex_label_offsets_ptr = \
+            starting_vertex_label_offsets.__cuda_array_interface__['data'][0]
+
+
+    cdef cugraph_type_erased_device_array_view_t* start_vertex_list_ptr = \
+        cugraph_type_erased_device_array_view_create(
+            <void*>cai_start_ptr,
+            len(start_vertex_list),
+            get_c_type_from_numpy_type(start_vertex_list.dtype))
+
+
+    cdef cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if starting_vertex_label_offsets is not None:
+        starting_vertex_label_offsets_ptr = \
+            cugraph_type_erased_device_array_view_create(
+                <void*>cai_starting_vertex_label_offsets_ptr,
+                len(starting_vertex_label_offsets),
+                SIZE_T
+            )
+
+    cdef cugraph_type_erased_device_array_view_t* label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if retain_seeds:
+        if starting_vertex_label_offsets is None:
+            raise ValueError("Must provide label offsets if retain_seeds is True")
+
+    cg_rng_state = CuGraphRandomState(resource_handle, random_state)
+
+    cdef cugraph_rng_state_t* rng_state_ptr = \
+        cg_rng_state.rng_state_ptr
+
+    cdef cugraph_prior_sources_behavior_t prior_sources_behavior_e
+    if prior_sources_behavior is None:
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.DEFAULT
+    elif prior_sources_behavior == 'carryover':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.CARRY_OVER
+    elif prior_sources_behavior == 'exclude':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.EXCLUDE
+    else:
+        raise ValueError(
+            f'Invalid option {prior_sources_behavior}'
+            ' for prior sources behavior'
+        )
+
+    cdef cugraph_compression_type_t compression_behavior_e
+    if compression is None or compression == 'COO':
+        compression_behavior_e = cugraph_compression_type_t.COO
+    elif compression == 'CSR':
+        compression_behavior_e = cugraph_compression_type_t.CSR
+    elif compression == 'CSC':
+        compression_behavior_e = cugraph_compression_type_t.CSC
+    elif compression == 'DCSR':
+        compression_behavior_e = cugraph_compression_type_t.DCSR
+    elif compression == 'DCSC':
+        compression_behavior_e = cugraph_compression_type_t.DCSC
+    else:
+        raise ValueError(
+            f'Invalid option {compression}'
+            ' for compression type'
+        )
+
+    cdef cugraph_sampling_options_t* sampling_options
+    error_code = cugraph_sampling_options_create(&sampling_options, &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_sampling_options_create")
+
+    cugraph_sampling_set_with_replacement(sampling_options, with_replacement)
+    cugraph_sampling_set_return_hops(sampling_options, c_return_hops)
+    cugraph_sampling_set_dedupe_sources(sampling_options, c_deduplicate_sources)
+    cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior_e)
+    cugraph_sampling_set_renumber_results(sampling_options, c_renumber)
+    cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e)
+    cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop)
+    cugraph_sampling_set_retain_seeds(sampling_options, retain_seeds)
+
+    error_code = cugraph_heterogeneous_biased_neighbor_sample(
+        c_resource_handle_ptr,
+        rng_state_ptr,
+        c_graph_ptr,
+        <cugraph_edge_property_view_t*>NULL, # FIXME: Add support for biased neighbor sampling
+        start_vertex_list_ptr,
+        starting_vertex_label_offsets_ptr,
+        fan_out_ptr,
+        num_edge_types,
+        sampling_options,
+        do_expensive_check,
+        &result_ptr,
+        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_heterogeneous_biased_neighbor_sample")
+
+    # Free the sampling options
+    cugraph_sampling_options_free(sampling_options)
+
+    # Free the two input arrays that are no longer needed.
+    cugraph_type_erased_device_array_view_free(start_vertex_list_ptr)
+    cugraph_type_erased_host_array_view_free(fan_out_ptr)
+
+    if starting_vertex_label_offsets is not None:
+        cugraph_type_erased_device_array_view_free(starting_vertex_label_offsets_ptr)
+
+    # Have the SamplingResult instance assume ownership of the result data.
+    result = SamplingResult()
+    result.set_ptr(result_ptr)
+
+    # Get cupy "views" of the individual arrays to return. These each increment
+    # the refcount on the SamplingResult instance which will keep the data alive
+    # until all references are removed and the GC runs.
+
+    cupy_majors = result.get_majors()
+    cupy_major_offsets = result.get_major_offsets()
+    cupy_minors = result.get_minors()
+    cupy_edge_weights = result.get_edge_weights()
+    cupy_edge_ids = result.get_edge_ids()
+    cupy_edge_types = result.get_edge_types()
+    cupy_batch_ids = result.get_batch_ids()
+    cupy_label_hop_offsets = result.get_label_hop_offsets()
+
+
+    if renumber:
+        cupy_renumber_map = result.get_renumber_map()
+        cupy_renumber_map_offsets = result.get_renumber_map_offsets()
+        cupy_edge_renumber_map = result.get_edge_renumber_map()
+        cupy_edge_renumber_map_offsets = result.get_edge_renumber_map_offsets()
+
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+            'hop_id': None,
+            'renumber_map': cupy_renumber_map,
+            'renumber_map_offsets': cupy_renumber_map_offsets,
+            'edge_renumber_map' : cupy_edge_renumber_map,
+            'edge_renumber_map_offsets' : cupy_edge_renumber_map_offsets
+        }
+
+    else:
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+        }
+
+    # Return everything that isn't null
+    return {k: v for k, v in sampling_results.items() if v is not None}
diff --git a/python/pylibcugraph/pylibcugraph/heterogeneous_uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/heterogeneous_uniform_neighbor_sample.pyx
new file mode 100644
index 00000000000..3fa3575e27d
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/heterogeneous_uniform_neighbor_sample.pyx
@@ -0,0 +1,419 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from pylibcugraph._cugraph_c.types cimport (
+    bool_t,
+    SIZE_T
+)
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_create,
+    cugraph_type_erased_device_array_view_free,
+    cugraph_type_erased_host_array_view_t,
+    cugraph_type_erased_host_array_view_create,
+    cugraph_type_erased_host_array_view_free,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.algorithms cimport (
+    cugraph_sample_result_t,
+    cugraph_prior_sources_behavior_t,
+    cugraph_compression_type_t,
+    cugraph_sampling_options_t,
+    cugraph_sampling_options_create,
+    cugraph_sampling_options_free,
+    cugraph_sampling_set_with_replacement,
+    cugraph_sampling_set_return_hops,
+    cugraph_sampling_set_prior_sources_behavior,
+    cugraph_sampling_set_dedupe_sources,
+    cugraph_sampling_set_renumber_results,
+    cugraph_sampling_set_compress_per_hop,
+    cugraph_sampling_set_compression_type,
+    cugraph_sampling_set_retain_seeds,
+)
+from pylibcugraph._cugraph_c.sampling_algorithms cimport (
+    cugraph_heterogeneous_uniform_neighbor_sample,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    assert_CAI_type,
+    assert_AI_type,
+    get_c_type_from_numpy_type,
+)
+from pylibcugraph.internal_types.sampling_result cimport (
+    SamplingResult,
+)
+from pylibcugraph._cugraph_c.random cimport (
+    cugraph_rng_state_t
+)
+from pylibcugraph.random cimport (
+    CuGraphRandomState
+)
+import warnings
+
+# TODO accept cupy/numpy random state in addition to raw seed.
+def heterogeneous_uniform_neighbor_sample(ResourceHandle resource_handle,
+                                          _GPUGraph input_graph,
+                                          start_vertex_list,
+                                          starting_vertex_label_offsets,
+                                          h_fan_out,
+                                          num_edge_types,
+                                          bool_t with_replacement,
+                                          bool_t do_expensive_check,
+                                          prior_sources_behavior=None,
+                                          deduplicate_sources=False,
+                                          return_hops=False,
+                                          renumber=False,
+                                          retain_seeds=False,
+                                          compression='COO',
+                                          compress_per_hop=False,
+                                          random_state=None):
+    """
+    Performs uniform neighborhood sampling, which samples nodes from
+    a graph based on the current node's neighbors, with a corresponding fan_out
+    value at each hop. The edges are sampled uniformly. Heterogeneous
+    neighborhood sampling translates to more than 1 edge types.
+
+    Parameters
+    ----------
+    resource_handle: ResourceHandle
+        Handle to the underlying device and host resources needed for
+        referencing data and running algorithms.
+
+    input_graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    start_vertex_list: device array type
+        Device array containing the list of starting vertices for sampling.
+
+    starting_vertex_label_offsets: device array type (Optional)
+        Offsets of each label within the start vertex list. Expanding
+        'starting_vertex_label_offsets' must lead to an array of
+        len(start_vertex_list)
+
+    h_fan_out: numpy array type
+        Device array containing the branching out (fan-out) degrees per
+        starting vertex for each hop level. The fanout value at each hop for each
+        edge type is given by the relationship
+        h_fanout[x*num_edge_types + edge_type_id]
+
+        The sampling method can use different fan_out values for each edge type
+        which is not the case for homogeneous neighborhood sampling (both biased
+        and uniform).
+
+    num_edge_types: int
+        Number of edge types where a value of 1 translates to homogeneous neighbor
+        sample whereas a value greater than 1 translates to heterogeneous neighbor
+        sample.
+
+    with_replacement: bool
+        If true, sampling procedure is done with replacement (the same vertex
+        can be selected multiple times in the same step).
+
+    do_expensive_check: bool
+        If True, performs more extensive tests on the inputs to ensure
+        validitity, at the expense of increased run time.
+
+    prior_sources_behavior: str (Optional)
+        Options are "carryover", and "exclude".
+        Default will leave the source list as-is.
+        Carryover will carry over sources from previous hops to the
+        current hop.
+        Exclude will exclude sources from previous hops from reappearing
+        as sources in future hops.
+
+    deduplicate_sources: bool (Optional)
+        If True, will deduplicate the source list before sampling.
+        Defaults to False.
+
+    renumber: bool (Optional)
+        If True, will renumber the sources and destinations on a
+        per-batch basis and return the renumber map and batch offsets
+        in additional to the standard returns.
+
+    retain_seeds: bool (Optional)
+        If True, will retain the original seeds (original source vertices)
+        in the output even if they do not have outgoing neighbors.
+        Defaults to False.
+
+    compression: str (Optional)
+        Options: COO (default), CSR, CSC, DCSR, DCSR
+        Sets the compression format for the returned samples.
+
+    compress_per_hop: bool (Optional)
+        If False (default), will create a compressed edgelist for the
+        entire batch.
+        If True, will create a separate compressed edgelist per hop within
+        a batch.
+
+    random_state: int (Optional)
+        Random state to use when generating samples.  Optional argument,
+        defaults to a hash of process id, time, and hostname.
+        (See pylibcugraph.random.CuGraphRandomState)
+
+    Returns
+    -------
+    A tuple of device arrays, where the first and second items in the tuple
+    are device arrays containing the starting and ending vertices of each
+    walk respectively, the third item in the tuple is a device array
+    containing the start labels, and the fourth item in the tuple is a device
+    array containing the indices for reconstructing paths.
+
+    If renumber was set to True, then the fifth item in the tuple is a device
+    array containing the renumber map, and the sixth item in the tuple is a
+    device array containing the renumber map offsets (which delineate where
+    the renumber map for each batch starts).
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5],
+    ...                     dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4],
+    ...                     dtype=numpy.int32)
+    >>> weights = cupy.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 2.1,
+    ...                         1.1, 5.1, 3.1,  4.1, 7.2, 3.2], dtype=numpy.float32)
+    >>> edge_types = cupy.asarray([0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1],
+    ...                            dtype=numpy.int32)
+    >>> start_vertices = cupy.asarray([2, 5, 1]).astype(numpy.int32)
+    >>> num_edge_types = 2
+    >>> starting_vertex_label_offsets = cupy.asarray([0, 2, 3])
+    >>> h_fan_out = numpy.array([2]).astype(numpy.int32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=False, renumber=False, do_expensive_check=False)
+    >>> sampling_results = pylibcugraph.heterogeneous_uniform_neighbor_sample(
+    ...         resource_handle, G, start_vertices, starting_vertex_label_offsets,
+    ...         h_fan_out, num_edge_types, False, True)
+    >>> sampling_results
+    {'majors': array([2, 2, 2, 5, 5, 1, 1, 1, 1], dtype=int32),
+     'minors': array([0, 1, 3, 3, 4, 0, 2, 3, 4], dtype=int32),
+     'weight': array([5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 3.1, 2.1, 1.1], dtype=float32)}
+
+    """
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = (
+        resource_handle.c_resource_handle_ptr
+    )
+
+    cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr
+    cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = <cugraph_type_erased_host_array_view_t*>NULL
+
+    cdef bool_t c_deduplicate_sources = deduplicate_sources
+    cdef bool_t c_return_hops = return_hops
+    cdef bool_t c_renumber = renumber
+    cdef bool_t c_compress_per_hop = compress_per_hop
+
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+    cdef uintptr_t ai_fan_out_ptr
+
+    # FIXME: refactor the way we are creating pointer. Can use a single helper function to create
+
+    assert_CAI_type(start_vertex_list, "start_vertex_list")
+    assert_CAI_type(starting_vertex_label_offsets, "starting_vertex_label_offsets", True)
+
+    assert_AI_type(h_fan_out, "h_fan_out")
+
+    if starting_vertex_label_offsets is not None:
+        if starting_vertex_label_offsets[-1] != len(start_vertex_list):
+            raise ValueError(
+                "'starting_vertex_label_offsets' and 'start_vertex_list' must be proportional")
+
+    ai_fan_out_ptr = \
+        h_fan_out.__array_interface__["data"][0]
+
+    fan_out_ptr = \
+        cugraph_type_erased_host_array_view_create(
+            <void*>ai_fan_out_ptr,
+            len(h_fan_out),
+            get_c_type_from_numpy_type(h_fan_out.dtype))
+
+
+
+    cdef cugraph_sample_result_t* result_ptr
+
+    cdef uintptr_t cai_start_ptr = \
+        start_vertex_list.__cuda_array_interface__["data"][0]
+
+    cdef uintptr_t cai_starting_vertex_label_offsets_ptr
+    if starting_vertex_label_offsets is not None:
+        cai_starting_vertex_label_offsets_ptr = \
+            starting_vertex_label_offsets.__cuda_array_interface__['data'][0]
+
+
+    cdef cugraph_type_erased_device_array_view_t* start_vertex_list_ptr = \
+        cugraph_type_erased_device_array_view_create(
+            <void*>cai_start_ptr,
+            len(start_vertex_list),
+            get_c_type_from_numpy_type(start_vertex_list.dtype))
+
+
+    cdef cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if starting_vertex_label_offsets is not None:
+        starting_vertex_label_offsets_ptr = \
+            cugraph_type_erased_device_array_view_create(
+                <void*>cai_starting_vertex_label_offsets_ptr,
+                len(starting_vertex_label_offsets),
+                SIZE_T
+            )
+
+    cdef cugraph_type_erased_device_array_view_t* label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if retain_seeds:
+        if starting_vertex_label_offsets is None:
+            raise ValueError("Must provide label offsets if retain_seeds is True")
+
+    cg_rng_state = CuGraphRandomState(resource_handle, random_state)
+
+    cdef cugraph_rng_state_t* rng_state_ptr = \
+        cg_rng_state.rng_state_ptr
+
+    cdef cugraph_prior_sources_behavior_t prior_sources_behavior_e
+    if prior_sources_behavior is None:
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.DEFAULT
+    elif prior_sources_behavior == 'carryover':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.CARRY_OVER
+    elif prior_sources_behavior == 'exclude':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.EXCLUDE
+    else:
+        raise ValueError(
+            f'Invalid option {prior_sources_behavior}'
+            ' for prior sources behavior'
+        )
+
+    cdef cugraph_compression_type_t compression_behavior_e
+    if compression is None or compression == 'COO':
+        compression_behavior_e = cugraph_compression_type_t.COO
+    elif compression == 'CSR':
+        compression_behavior_e = cugraph_compression_type_t.CSR
+    elif compression == 'CSC':
+        compression_behavior_e = cugraph_compression_type_t.CSC
+    elif compression == 'DCSR':
+        compression_behavior_e = cugraph_compression_type_t.DCSR
+    elif compression == 'DCSC':
+        compression_behavior_e = cugraph_compression_type_t.DCSC
+    else:
+        raise ValueError(
+            f'Invalid option {compression}'
+            ' for compression type'
+        )
+
+    cdef cugraph_sampling_options_t* sampling_options
+    error_code = cugraph_sampling_options_create(&sampling_options, &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_sampling_options_create")
+
+    cugraph_sampling_set_with_replacement(sampling_options, with_replacement)
+    cugraph_sampling_set_return_hops(sampling_options, c_return_hops)
+    cugraph_sampling_set_dedupe_sources(sampling_options, c_deduplicate_sources)
+    cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior_e)
+    cugraph_sampling_set_renumber_results(sampling_options, c_renumber)
+    cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e)
+    cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop)
+    cugraph_sampling_set_retain_seeds(sampling_options, retain_seeds)
+
+    error_code = cugraph_heterogeneous_uniform_neighbor_sample(
+        c_resource_handle_ptr,
+        rng_state_ptr,
+        c_graph_ptr,
+        start_vertex_list_ptr,
+        starting_vertex_label_offsets_ptr,
+        fan_out_ptr,
+        num_edge_types,
+        sampling_options,
+        do_expensive_check,
+        &result_ptr,
+        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_heterogeneous_uniform_neighbor_sample")
+
+    # Free the sampling options
+    cugraph_sampling_options_free(sampling_options)
+
+    # Free the two input arrays that are no longer needed.
+    cugraph_type_erased_device_array_view_free(start_vertex_list_ptr)
+    cugraph_type_erased_host_array_view_free(fan_out_ptr)
+
+    if starting_vertex_label_offsets is not None:
+        cugraph_type_erased_device_array_view_free(starting_vertex_label_offsets_ptr)
+
+    # Have the SamplingResult instance assume ownership of the result data.
+    result = SamplingResult()
+    result.set_ptr(result_ptr)
+
+    # Get cupy "views" of the individual arrays to return. These each increment
+    # the refcount on the SamplingResult instance which will keep the data alive
+    # until all references are removed and the GC runs.
+    cupy_majors = result.get_majors()
+    cupy_major_offsets = result.get_major_offsets()
+    cupy_minors = result.get_minors()
+    cupy_edge_weights = result.get_edge_weights()
+    cupy_edge_ids = result.get_edge_ids()
+    cupy_edge_types = result.get_edge_types()
+    cupy_batch_ids = result.get_batch_ids()
+    cupy_label_hop_offsets = result.get_label_hop_offsets()
+
+    if renumber:
+        cupy_renumber_map = result.get_renumber_map()
+        cupy_renumber_map_offsets = result.get_renumber_map_offsets()
+        cupy_edge_renumber_map = result.get_edge_renumber_map()
+        cupy_edge_renumber_map_offsets = result.get_edge_renumber_map_offsets()
+
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+            'hop_id': None,
+            'renumber_map': cupy_renumber_map,
+            'renumber_map_offsets': cupy_renumber_map_offsets,
+            'edge_renumber_map' : cupy_edge_renumber_map,
+            'edge_renumber_map_offsets' : cupy_edge_renumber_map_offsets
+        }
+
+    else:
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+        }
+
+    # Return everything that isn't null
+    return {k: v for k, v in sampling_results.items() if v is not None}
diff --git a/python/pylibcugraph/pylibcugraph/homogeneous_biased_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/homogeneous_biased_neighbor_sample.pyx
new file mode 100644
index 00000000000..e2476de1607
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/homogeneous_biased_neighbor_sample.pyx
@@ -0,0 +1,418 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from pylibcugraph._cugraph_c.types cimport (
+    bool_t,
+    SIZE_T
+)
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.properties cimport (
+    cugraph_edge_property_view_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_create,
+    cugraph_type_erased_device_array_view_free,
+    cugraph_type_erased_host_array_view_t,
+    cugraph_type_erased_host_array_view_create,
+    cugraph_type_erased_host_array_view_free,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.algorithms cimport (
+    cugraph_sample_result_t,
+    cugraph_prior_sources_behavior_t,
+    cugraph_compression_type_t,
+    cugraph_sampling_options_t,
+    cugraph_sampling_options_create,
+    cugraph_sampling_options_free,
+    cugraph_sampling_set_with_replacement,
+    cugraph_sampling_set_return_hops,
+    cugraph_sampling_set_prior_sources_behavior,
+    cugraph_sampling_set_dedupe_sources,
+    cugraph_sampling_set_renumber_results,
+    cugraph_sampling_set_compress_per_hop,
+    cugraph_sampling_set_compression_type,
+    cugraph_sampling_set_retain_seeds,
+)
+from pylibcugraph._cugraph_c.sampling_algorithms cimport (
+    cugraph_homogeneous_biased_neighbor_sample,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    assert_CAI_type,
+    assert_AI_type,
+    get_c_type_from_numpy_type,
+)
+from pylibcugraph.internal_types.sampling_result cimport (
+    SamplingResult,
+)
+from pylibcugraph._cugraph_c.random cimport (
+    cugraph_rng_state_t
+)
+from pylibcugraph.random cimport (
+    CuGraphRandomState
+)
+import warnings
+
+# TODO accept cupy/numpy random state in addition to raw seed.
+def homogeneous_biased_neighbor_sample(ResourceHandle resource_handle,
+                                       _GPUGraph input_graph,
+                                       start_vertex_list,
+                                       starting_vertex_label_offsets,
+                                       h_fan_out,
+                                       bool_t with_replacement,
+                                       bool_t do_expensive_check,
+                                       prior_sources_behavior=None,
+                                       deduplicate_sources=False,
+                                       return_hops=False,
+                                       renumber=False,
+                                       retain_seeds=False,
+                                       compression='COO',
+                                       compress_per_hop=False,
+                                       random_state=None):
+    """
+    Performs biased neighborhood sampling, which samples nodes from
+    a graph based on the current node's neighbors, with a corresponding fan_out
+    value at each hop. The edges are sampled with biases. Homogeneous
+    neighborhood sampling translates to 1 edge type.
+
+    Parameters
+    ----------
+    resource_handle: ResourceHandle
+        Handle to the underlying device and host resources needed for
+        referencing data and running algorithms.
+
+    input_graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    edge_biases: not supported.
+
+    start_vertex_list: device array type
+        Device array containing the list of starting vertices for sampling.
+
+    starting_vertex_label_offsets: device array type (Optional)
+        Offsets of each label within the start vertex list. Expanding
+        'starting_vertex_label_offsets' must lead to an array of
+        len(start_vertex_list)
+
+    h_fan_out: tuple of numpy array type
+        Device array containing the branching out (fan-out) degrees per
+        starting vertex for each hop level
+
+        The sampling method can use different fan_out values for each edge type
+        which is not the case for homogeneous neighborhood sampling (both biased
+        and uniform).
+
+    with_replacement: bool
+        If true, sampling procedure is done with replacement (the same vertex
+        can be selected multiple times in the same step).
+
+    do_expensive_check: bool
+        If True, performs more extensive tests on the inputs to ensure
+        validitity, at the expense of increased run time.
+
+    prior_sources_behavior: str (Optional)
+        Options are "carryover", and "exclude".
+        Default will leave the source list as-is.
+        Carryover will carry over sources from previous hops to the
+        current hop.
+        Exclude will exclude sources from previous hops from reappearing
+        as sources in future hops.
+
+    deduplicate_sources: bool (Optional)
+        If True, will deduplicate the source list before sampling.
+        Defaults to False.
+
+    renumber: bool (Optional)
+        If True, will renumber the sources and destinations on a
+        per-batch basis and return the renumber map and batch offsets
+        in additional to the standard returns.
+
+    retain_seeds: bool (Optional)
+        If True, will retain the original seeds (original source vertices)
+        in the output even if they do not have outgoing neighbors.
+        Defaults to False.
+
+    compression: str (Optional)
+        Options: COO (default), CSR, CSC, DCSR, DCSR
+        Sets the compression format for the returned samples.
+
+    compress_per_hop: bool (Optional)
+        If False (default), will create a compressed edgelist for the
+        entire batch.
+        If True, will create a separate compressed edgelist per hop within
+        a batch.
+
+    random_state: int (Optional)
+        Random state to use when generating samples.  Optional argument,
+        defaults to a hash of process id, time, and hostname.
+        (See pylibcugraph.random.CuGraphRandomState)
+
+    Returns
+    -------
+    A tuple of device arrays, where the first and second items in the tuple
+    are device arrays containing the starting and ending vertices of each
+    walk respectively, the third item in the tuple is a device array
+    containing the start labels, and the fourth item in the tuple is a device
+    array containing the indices for reconstructing paths.
+
+    If renumber was set to True, then the fifth item in the tuple is a device
+    array containing the renumber map, and the sixth item in the tuple is a
+    device array containing the renumber map offsets (which delineate where
+    the renumber map for each batch starts).
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5], dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4], dtype=numpy.int32)
+    >>> weights = cupy.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 2.1, 1.1, 5.1, 3.1,
+    ...                         4.1, 7.2, 3.2], dtype=numpy.float32)
+    >>> start_vertices = cupy.asarray([2, 5]).astype(numpy.int32)
+    >>> h_fan_out = numpy.array([2]).astype(numpy.int32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=True, renumber=False, do_expensive_check=False)
+    >>> sampling_results = pylibcugraph.homogeneous_biased_neighbor_sample(
+    ...         resource_handle, G, start_vertices, None, h_fan_out, False, True)
+    >>> sampling_results
+    {'sources': array([2, 2, 5, 5], dtype=int32),
+     'destinations': array([1, 3, 3, 4], dtype=int32),
+     'indices': array([3.1, 4.1, 7.2, 3.2], dtype=float32)}
+
+    >>> start_vertices = cupy.asarray([2, 5, 1]).astype(numpy.int32)
+    >>> starting_vertex_label_offsets = cupy.asarray([0, 2, 3])
+    >>> sampling_results = pylibcugraph.homogeneous_biased_neighbor_sample(
+    ...         resource_handle, G, start_vertices, starting_vertex_label_offsets,
+    ...         h_fan_out, False, True)
+    >>> >>> sampling_results
+    {'majors': array([2, 2, 5, 5, 1, 1], dtype=int32),
+     'minors': array([1, 3, 3, 4, 3, 4], dtype=int32),
+     'weight': array([3.1, 4.1, 7.2, 3.2, 2.1, 1.1], dtype=float32)}
+
+    """
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = (
+        resource_handle.c_resource_handle_ptr
+    )
+
+    cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr
+    cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = <cugraph_type_erased_host_array_view_t*>NULL
+
+    cdef bool_t c_deduplicate_sources = deduplicate_sources
+    cdef bool_t c_return_hops = return_hops
+    cdef bool_t c_renumber = renumber
+    cdef bool_t c_compress_per_hop = compress_per_hop
+
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+    cdef uintptr_t ai_fan_out_ptr
+
+    # FIXME: refactor the way we are creating pointer. Can use a single helper function to create
+
+    assert_CAI_type(start_vertex_list, "start_vertex_list")
+    assert_CAI_type(starting_vertex_label_offsets, "starting_vertex_label_offsets", True)
+
+    assert_AI_type(h_fan_out, "h_fan_out")
+
+    if starting_vertex_label_offsets is not None:
+        if starting_vertex_label_offsets[-1] != len(start_vertex_list):
+            raise ValueError(
+                "'starting_vertex_label_offsets' and 'start_vertex_list' must be proportional")
+
+    ai_fan_out_ptr = \
+        h_fan_out.__array_interface__["data"][0]
+
+    fan_out_ptr = \
+        cugraph_type_erased_host_array_view_create(
+            <void*>ai_fan_out_ptr,
+            len(h_fan_out),
+            get_c_type_from_numpy_type(h_fan_out.dtype))
+
+
+
+    cdef cugraph_sample_result_t* result_ptr
+
+    cdef uintptr_t cai_start_ptr = \
+        start_vertex_list.__cuda_array_interface__["data"][0]
+
+    cdef uintptr_t cai_starting_vertex_label_offsets_ptr
+    if starting_vertex_label_offsets is not None:
+        cai_starting_vertex_label_offsets_ptr = \
+            starting_vertex_label_offsets.__cuda_array_interface__['data'][0]
+
+
+    cdef cugraph_type_erased_device_array_view_t* start_vertex_list_ptr = \
+        cugraph_type_erased_device_array_view_create(
+            <void*>cai_start_ptr,
+            len(start_vertex_list),
+            get_c_type_from_numpy_type(start_vertex_list.dtype))
+
+
+    cdef cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if starting_vertex_label_offsets is not None:
+        starting_vertex_label_offsets_ptr = \
+            cugraph_type_erased_device_array_view_create(
+                <void*>cai_starting_vertex_label_offsets_ptr,
+                len(starting_vertex_label_offsets),
+                SIZE_T
+            )
+
+    cdef cugraph_type_erased_device_array_view_t* label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if retain_seeds:
+        if starting_vertex_label_offsets is None:
+            raise ValueError("Must provide label offsets if retain_seeds is True")
+
+    cg_rng_state = CuGraphRandomState(resource_handle, random_state)
+
+    cdef cugraph_rng_state_t* rng_state_ptr = \
+        cg_rng_state.rng_state_ptr
+
+    cdef cugraph_prior_sources_behavior_t prior_sources_behavior_e
+    if prior_sources_behavior is None:
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.DEFAULT
+    elif prior_sources_behavior == 'carryover':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.CARRY_OVER
+    elif prior_sources_behavior == 'exclude':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.EXCLUDE
+    else:
+        raise ValueError(
+            f'Invalid option {prior_sources_behavior}'
+            ' for prior sources behavior'
+        )
+
+    cdef cugraph_compression_type_t compression_behavior_e
+    if compression is None or compression == 'COO':
+        compression_behavior_e = cugraph_compression_type_t.COO
+    elif compression == 'CSR':
+        compression_behavior_e = cugraph_compression_type_t.CSR
+    elif compression == 'CSC':
+        compression_behavior_e = cugraph_compression_type_t.CSC
+    elif compression == 'DCSR':
+        compression_behavior_e = cugraph_compression_type_t.DCSR
+    elif compression == 'DCSC':
+        compression_behavior_e = cugraph_compression_type_t.DCSC
+    else:
+        raise ValueError(
+            f'Invalid option {compression}'
+            ' for compression type'
+        )
+
+    cdef cugraph_sampling_options_t* sampling_options
+    error_code = cugraph_sampling_options_create(&sampling_options, &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_sampling_options_create")
+
+    cugraph_sampling_set_with_replacement(sampling_options, with_replacement)
+    cugraph_sampling_set_return_hops(sampling_options, c_return_hops)
+    cugraph_sampling_set_dedupe_sources(sampling_options, c_deduplicate_sources)
+    cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior_e)
+    cugraph_sampling_set_renumber_results(sampling_options, c_renumber)
+    cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e)
+    cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop)
+    cugraph_sampling_set_retain_seeds(sampling_options, retain_seeds)
+
+    error_code = cugraph_homogeneous_biased_neighbor_sample(
+        c_resource_handle_ptr,
+        rng_state_ptr,
+        c_graph_ptr,
+        <cugraph_edge_property_view_t*>NULL, # FIXME: Add support for biased neighbor sampling
+        start_vertex_list_ptr,
+        starting_vertex_label_offsets_ptr,
+        fan_out_ptr,
+        sampling_options,
+        do_expensive_check,
+        &result_ptr,
+        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_homogeneous_biased_neighbor_sample")
+
+    # Free the sampling options
+    cugraph_sampling_options_free(sampling_options)
+
+    # Free the two input arrays that are no longer needed.
+    cugraph_type_erased_device_array_view_free(start_vertex_list_ptr)
+    cugraph_type_erased_host_array_view_free(fan_out_ptr)
+
+    if starting_vertex_label_offsets is not None:
+        cugraph_type_erased_device_array_view_free(starting_vertex_label_offsets_ptr)
+
+    # Have the SamplingResult instance assume ownership of the result data.
+    result = SamplingResult()
+    result.set_ptr(result_ptr)
+
+    # Get cupy "views" of the individual arrays to return. These each increment
+    # the refcount on the SamplingResult instance which will keep the data alive
+    # until all references are removed and the GC runs.
+    cupy_majors = result.get_majors()
+    cupy_major_offsets = result.get_major_offsets()
+    cupy_minors = result.get_minors()
+    cupy_edge_weights = result.get_edge_weights()
+    cupy_edge_ids = result.get_edge_ids()
+    cupy_edge_types = result.get_edge_types()
+    cupy_batch_ids = result.get_batch_ids()
+    cupy_label_hop_offsets = result.get_label_hop_offsets()
+    if renumber:
+        cupy_renumber_map = result.get_renumber_map()
+        cupy_renumber_map_offsets = result.get_renumber_map_offsets()
+        cupy_edge_renumber_map = result.get_edge_renumber_map()
+        cupy_edge_renumber_map_offsets = result.get_edge_renumber_map_offsets()
+
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+            'hop_id': None,
+            'renumber_map': cupy_renumber_map,
+            'renumber_map_offsets': cupy_renumber_map_offsets,
+            'edge_renumber_map' : cupy_edge_renumber_map,
+            'edge_renumber_map_offsets' : cupy_edge_renumber_map_offsets
+        }
+
+    else:
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+        }
+
+    # Return everything that isn't null
+    return {k: v for k, v in sampling_results.items() if v is not None}
diff --git a/python/pylibcugraph/pylibcugraph/homogeneous_uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/homogeneous_uniform_neighbor_sample.pyx
new file mode 100644
index 00000000000..3c6cdf77420
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/homogeneous_uniform_neighbor_sample.pyx
@@ -0,0 +1,413 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from pylibcugraph._cugraph_c.types cimport (
+    bool_t,
+    SIZE_T
+)
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_create,
+    cugraph_type_erased_device_array_view_free,
+    cugraph_type_erased_host_array_view_t,
+    cugraph_type_erased_host_array_view_create,
+    cugraph_type_erased_host_array_view_free,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.algorithms cimport (
+    cugraph_sample_result_t,
+    cugraph_prior_sources_behavior_t,
+    cugraph_compression_type_t,
+    cugraph_sampling_options_t,
+    cugraph_sampling_options_create,
+    cugraph_sampling_options_free,
+    cugraph_sampling_set_with_replacement,
+    cugraph_sampling_set_return_hops,
+    cugraph_sampling_set_prior_sources_behavior,
+    cugraph_sampling_set_dedupe_sources,
+    cugraph_sampling_set_renumber_results,
+    cugraph_sampling_set_compress_per_hop,
+    cugraph_sampling_set_compression_type,
+    cugraph_sampling_set_retain_seeds,
+)
+from pylibcugraph._cugraph_c.sampling_algorithms cimport (
+    cugraph_homogeneous_uniform_neighbor_sample,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    assert_CAI_type,
+    assert_AI_type,
+    get_c_type_from_numpy_type,
+)
+from pylibcugraph.internal_types.sampling_result cimport (
+    SamplingResult,
+)
+from pylibcugraph._cugraph_c.random cimport (
+    cugraph_rng_state_t
+)
+from pylibcugraph.random cimport (
+    CuGraphRandomState
+)
+import warnings
+
+# TODO accept cupy/numpy random state in addition to raw seed.
+def homogeneous_uniform_neighbor_sample(ResourceHandle resource_handle,
+                                          _GPUGraph input_graph,
+                                          start_vertex_list,
+                                          starting_vertex_label_offsets,
+                                          h_fan_out,
+                                          bool_t with_replacement,
+                                          bool_t do_expensive_check,
+                                          prior_sources_behavior=None,
+                                          deduplicate_sources=False,
+                                          return_hops=False,
+                                          renumber=False,
+                                          retain_seeds=False,
+                                          compression='COO',
+                                          compress_per_hop=False,
+                                          random_state=None):
+    """
+    Performs biased neighborhood sampling, which samples nodes from
+    a graph based on the current node's neighbors, with a corresponding fan_out
+    value at each hop. The edges are sampled uniformly. Homogeneous
+    neighborhood sampling translates to 1 edge type.
+
+    Parameters
+    ----------
+    resource_handle: ResourceHandle
+        Handle to the underlying device and host resources needed for
+        referencing data and running algorithms.
+
+    input_graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    start_vertex_list: device array type
+        Device array containing the list of starting vertices for sampling.
+
+    starting_vertex_label_offsets: device array type (Optional)
+        Offsets of each label within the start vertex list. Expanding
+        'starting_vertex_label_offsets' must lead to an array of
+        len(start_vertex_list)
+
+    h_fan_out: numpy array type
+        Device array containing the branching out (fan-out) degrees per
+        starting vertex for each hop level
+
+        The sampling method can use different fan_out values for each edge type
+        which is not the case for homogeneous neighborhood sampling (both biased
+        and uniform).
+
+    with_replacement: bool
+        If true, sampling procedure is done with replacement (the same vertex
+        can be selected multiple times in the same step).
+
+    do_expensive_check: bool
+        If True, performs more extensive tests on the inputs to ensure
+        validitity, at the expense of increased run time.
+
+    prior_sources_behavior: str (Optional)
+        Options are "carryover", and "exclude".
+        Default will leave the source list as-is.
+        Carryover will carry over sources from previous hops to the
+        current hop.
+        Exclude will exclude sources from previous hops from reappearing
+        as sources in future hops.
+
+    deduplicate_sources: bool (Optional)
+        If True, will deduplicate the source list before sampling.
+        Defaults to False.
+
+    renumber: bool (Optional)
+        If True, will renumber the sources and destinations on a
+        per-batch basis and return the renumber map and batch offsets
+        in additional to the standard returns.
+
+    retain_seeds: bool (Optional)
+        If True, will retain the original seeds (original source vertices)
+        in the output even if they do not have outgoing neighbors.
+        Defaults to False.
+
+    compression: str (Optional)
+        Options: COO (default), CSR, CSC, DCSR, DCSR
+        Sets the compression format for the returned samples.
+
+    compress_per_hop: bool (Optional)
+        If False (default), will create a compressed edgelist for the
+        entire batch.
+        If True, will create a separate compressed edgelist per hop within
+        a batch.
+
+    random_state: int (Optional)
+        Random state to use when generating samples.  Optional argument,
+        defaults to a hash of process id, time, and hostname.
+        (See pylibcugraph.random.CuGraphRandomState)
+
+    Returns
+    -------
+    A tuple of device arrays, where the first and second items in the tuple
+    are device arrays containing the starting and ending vertices of each
+    walk respectively, the third item in the tuple is a device array
+    containing the start labels, and the fourth item in the tuple is a device
+    array containing the indices for reconstructing paths.
+
+    If renumber was set to True, then the fifth item in the tuple is a device
+    array containing the renumber map, and the sixth item in the tuple is a
+    device array containing the renumber map offsets (which delineate where
+    the renumber map for each batch starts).
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5], dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4], dtype=numpy.int32)
+    >>> weights = cupy.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 2.1, 1.1, 5.1, 3.1,
+    ...                         4.1, 7.2, 3.2], dtype=numpy.float32)
+    >>> start_vertices = cupy.asarray([2, 5]).astype(numpy.int32)
+    >>> h_fan_out = numpy.array([2]).astype(numpy.int32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=True, renumber=False, do_expensive_check=False)
+    >>> sampling_results = pylibcugraph.homogeneous_uniform_neighbor_sample(
+    ...         resource_handle, G, start_vertices, None, h_fan_out, False, True)
+    >>> sampling_results
+    {'sources': array([2, 2, 5, 5], dtype=int32),
+     'destinations': array([1, 3, 3, 4], dtype=int32),
+     'indices': array([3.1, 4.1, 7.2, 3.2], dtype=float32)}
+
+    >>> start_vertices = cupy.asarray([2, 5, 1]).astype(numpy.int32)
+    >>> starting_vertex_label_offsets = cupy.asarray([0, 2, 3])
+    >>> sampling_results = pylibcugraph.homogeneous_uniform_neighbor_sample(
+    ...         resource_handle, G, start_vertices, starting_vertex_label_offsets,
+    ...         h_fan_out, False, True)
+    >>> >>> sampling_results
+    {'majors': array([2, 2, 5, 5, 1, 1], dtype=int32),
+     'minors': array([1, 3, 3, 4, 3, 4], dtype=int32),
+     'weight': array([3.1, 4.1, 7.2, 3.2, 2.1, 1.1], dtype=float32)}
+
+    """
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = (
+        resource_handle.c_resource_handle_ptr
+    )
+
+    cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr
+    cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = <cugraph_type_erased_host_array_view_t*>NULL
+
+    cdef bool_t c_deduplicate_sources = deduplicate_sources
+    cdef bool_t c_return_hops = return_hops
+    cdef bool_t c_renumber = renumber
+    cdef bool_t c_compress_per_hop = compress_per_hop
+
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+    cdef uintptr_t ai_fan_out_ptr
+
+    # FIXME: refactor the way we are creating pointer. Can use a single helper function to create
+
+    assert_CAI_type(start_vertex_list, "start_vertex_list")
+    assert_CAI_type(starting_vertex_label_offsets, "starting_vertex_label_offsets", True)
+
+    assert_AI_type(h_fan_out, "h_fan_out")
+
+    if starting_vertex_label_offsets is not None:
+        if starting_vertex_label_offsets[-1] != len(start_vertex_list):
+            raise ValueError(
+                "'starting_vertex_label_offsets' and 'start_vertex_list' must be proportional")
+
+    ai_fan_out_ptr = \
+        h_fan_out.__array_interface__["data"][0]
+
+    fan_out_ptr = \
+        cugraph_type_erased_host_array_view_create(
+            <void*>ai_fan_out_ptr,
+            len(h_fan_out),
+            get_c_type_from_numpy_type(h_fan_out.dtype))
+
+
+
+    cdef cugraph_sample_result_t* result_ptr
+
+    cdef uintptr_t cai_start_ptr = \
+        start_vertex_list.__cuda_array_interface__["data"][0]
+
+    cdef uintptr_t cai_starting_vertex_label_offsets_ptr
+    if starting_vertex_label_offsets is not None:
+        cai_starting_vertex_label_offsets_ptr = \
+            starting_vertex_label_offsets.__cuda_array_interface__['data'][0]
+
+
+    cdef cugraph_type_erased_device_array_view_t* start_vertex_list_ptr = \
+        cugraph_type_erased_device_array_view_create(
+            <void*>cai_start_ptr,
+            len(start_vertex_list),
+            get_c_type_from_numpy_type(start_vertex_list.dtype))
+
+
+    cdef cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if starting_vertex_label_offsets is not None:
+        starting_vertex_label_offsets_ptr = \
+            cugraph_type_erased_device_array_view_create(
+                <void*>cai_starting_vertex_label_offsets_ptr,
+                len(starting_vertex_label_offsets),
+                SIZE_T
+            )
+
+    cdef cugraph_type_erased_device_array_view_t* label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if retain_seeds:
+        if starting_vertex_label_offsets is None:
+            raise ValueError("Must provide label offsets if retain_seeds is True")
+
+    cg_rng_state = CuGraphRandomState(resource_handle, random_state)
+
+    cdef cugraph_rng_state_t* rng_state_ptr = \
+        cg_rng_state.rng_state_ptr
+
+    cdef cugraph_prior_sources_behavior_t prior_sources_behavior_e
+    if prior_sources_behavior is None:
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.DEFAULT
+    elif prior_sources_behavior == 'carryover':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.CARRY_OVER
+    elif prior_sources_behavior == 'exclude':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.EXCLUDE
+    else:
+        raise ValueError(
+            f'Invalid option {prior_sources_behavior}'
+            ' for prior sources behavior'
+        )
+
+    cdef cugraph_compression_type_t compression_behavior_e
+    if compression is None or compression == 'COO':
+        compression_behavior_e = cugraph_compression_type_t.COO
+    elif compression == 'CSR':
+        compression_behavior_e = cugraph_compression_type_t.CSR
+    elif compression == 'CSC':
+        compression_behavior_e = cugraph_compression_type_t.CSC
+    elif compression == 'DCSR':
+        compression_behavior_e = cugraph_compression_type_t.DCSR
+    elif compression == 'DCSC':
+        compression_behavior_e = cugraph_compression_type_t.DCSC
+    else:
+        raise ValueError(
+            f'Invalid option {compression}'
+            ' for compression type'
+        )
+
+    cdef cugraph_sampling_options_t* sampling_options
+    error_code = cugraph_sampling_options_create(&sampling_options, &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_sampling_options_create")
+
+    cugraph_sampling_set_with_replacement(sampling_options, with_replacement)
+    cugraph_sampling_set_return_hops(sampling_options, c_return_hops)
+    cugraph_sampling_set_dedupe_sources(sampling_options, c_deduplicate_sources)
+    cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior_e)
+    cugraph_sampling_set_renumber_results(sampling_options, c_renumber)
+    cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e)
+    cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop)
+    cugraph_sampling_set_retain_seeds(sampling_options, retain_seeds)
+
+    error_code = cugraph_homogeneous_uniform_neighbor_sample(
+        c_resource_handle_ptr,
+        rng_state_ptr,
+        c_graph_ptr,
+        start_vertex_list_ptr,
+        starting_vertex_label_offsets_ptr,
+        fan_out_ptr,
+        sampling_options,
+        do_expensive_check,
+        &result_ptr,
+        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_homogeneous_uniform_neighbor_sample")
+
+    # Free the sampling options
+    cugraph_sampling_options_free(sampling_options)
+
+    # Free the two input arrays that are no longer needed.
+    cugraph_type_erased_device_array_view_free(start_vertex_list_ptr)
+    cugraph_type_erased_host_array_view_free(fan_out_ptr)
+
+    if starting_vertex_label_offsets is not None:
+        cugraph_type_erased_device_array_view_free(starting_vertex_label_offsets_ptr)
+
+    # Have the SamplingResult instance assume ownership of the result data.
+    result = SamplingResult()
+    result.set_ptr(result_ptr)
+
+    # Get cupy "views" of the individual arrays to return. These each increment
+    # the refcount on the SamplingResult instance which will keep the data alive
+    # until all references are removed and the GC runs.
+    cupy_majors = result.get_majors()
+    cupy_major_offsets = result.get_major_offsets()
+    cupy_minors = result.get_minors()
+    cupy_edge_weights = result.get_edge_weights()
+    cupy_edge_ids = result.get_edge_ids()
+    cupy_edge_types = result.get_edge_types()
+    cupy_batch_ids = result.get_batch_ids()
+    cupy_label_hop_offsets = result.get_label_hop_offsets()
+
+    if renumber:
+        cupy_renumber_map = result.get_renumber_map()
+        cupy_renumber_map_offsets = result.get_renumber_map_offsets()
+        cupy_edge_renumber_map = result.get_edge_renumber_map()
+        cupy_edge_renumber_map_offsets = result.get_edge_renumber_map_offsets()
+
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+            'hop_id': None,
+            'renumber_map': cupy_renumber_map,
+            'renumber_map_offsets': cupy_renumber_map_offsets,
+            'edge_renumber_map' : cupy_edge_renumber_map,
+            'edge_renumber_map_offsets' : cupy_edge_renumber_map_offsets
+        }
+
+    else:
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+        }
+
+    # Return everything that isn't null
+    return {k: v for k, v in sampling_results.items() if v is not None}
diff --git a/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx b/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx
index f588237942b..b93618d73ce 100644
--- a/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx
+++ b/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx
@@ -34,6 +34,8 @@ from pylibcugraph._cugraph_c.algorithms cimport (
     cugraph_sample_result_get_offsets, # deprecated
     cugraph_sample_result_get_renumber_map,
     cugraph_sample_result_get_renumber_map_offsets,
+    cugraph_sample_result_get_edge_renumber_map,
+    cugraph_sample_result_get_edge_renumber_map_offsets,
     cugraph_sample_result_free,
 )
 from pylibcugraph.utils cimport (
@@ -257,3 +259,30 @@ cdef class SamplingResult:
 
         return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
                                                      self)
+
+
+    def get_edge_renumber_map(self):
+        if self.c_sample_result_ptr is NULL:
+            raise ValueError("pointer not set, must call set_ptr() with a "
+                             "non-NULL value first.")
+        cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
+            cugraph_sample_result_get_edge_renumber_map(self.c_sample_result_ptr)
+        )
+        if device_array_view_ptr is NULL:
+            return None
+
+        return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
+                                                     self)
+
+    def get_edge_renumber_map_offsets(self):
+        if self.c_sample_result_ptr is NULL:
+            raise ValueError("pointer not set, must call set_ptr() with a "
+                             "non-NULL value first.")
+        cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
+            cugraph_sample_result_get_edge_renumber_map_offsets(self.c_sample_result_ptr)
+        )
+        if device_array_view_ptr is NULL:
+            return None
+
+        return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
+                                                     self)
diff --git a/readme_pages/cugraph_dgl.md b/readme_pages/cugraph_dgl.md
deleted file mode 100644
index 7b19787f4c6..00000000000
--- a/readme_pages/cugraph_dgl.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# cugraph_dgl
-
-[RAPIDS](https://rapids.ai) cugraph_dgl enables the ability to use cugraph Property Graphs with DGL.  This cugraph backend allows DGL users access to a collection of GPU-accelerated algorithms for graph analytics, such as sampling, centrality computation, and community detection.
-
-
-The goal of `cugraph_dgl` is to enable Multi-Node Multi-GPU cugraph accelerated graphs to help train large-scale Graph Neural Networks(GNN) on DGL by providing a duck-typed version of the [DGLGraph](https://docs.dgl.ai/api/python/dgl.DGLGraph.html#dgl.DGLGraph)  which uses cugraph for storing graph structure and node/edge feature data.
-
-## Usage
-```diff
-
-+from cugraph_dgl.convert import cugraph_storage_from_heterograph
-+cugraph_g = cugraph_storage_from_heterograph(dgl_g)
-
-sampler = dgl.dataloading.NeighborSampler(
-        [15, 10, 5], prefetch_node_feats=['feat'], prefetch_labels=['label'])
-
-train_dataloader = dgl.dataloading.DataLoader(
-- dgl_g,
-+ cugraph_g,
-train_idx,
-sampler,
-device=device,
-batch_size=1024,
-shuffle=True,
-drop_last=False,
-num_workers=0)
-```
diff --git a/readme_pages/cugraph_pyg.md b/readme_pages/cugraph_pyg.md
deleted file mode 100644
index 147cd70b944..00000000000
--- a/readme_pages/cugraph_pyg.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# cugraph_pyg
-
-[RAPIDS](https://rapids.ai) cugraph_pyg enables the ability to use cugraph Property Graphs with PyTorch Geometric (PyG).  PyG users will have access to cuGraph and cuGraph-Service through the PyG GraphStore, FeatureStore, and Sampler interfaces.  Through cugraph_pyg, PyG users have the full power of cuGraph's GPU-accelerated algorithms for graph analytics, such as sampling, centrality computation, and community detection.
-
-
-The goal of `cugraph_pyg` is to enable accelerated single-GPU and multi-node, multi-GPU cugraph accelerated graphs to help train large-scale Graph Neural Networks (GNN) on PyG by providing duck-typed drop-in replacements of the `GraphStore`, `FeatureStore`, and `Sampler` interfaces backed by either cuGraph or cuGraph-Service.
-
-Users of cugraph_pyg have the option of installing either the cugraph or cugraph_service_client packages.  Only one is required.
-
-## Usage
-```
-G = cuGraph.PropertyGraph()
-...
-feature_store, graph_store = to_pyg(G)
-sampler = CuGraphSampler(
-    data=(feature_store, graph_store),
-    shuffle=True,
-    num_neighbors=[10,25],
-    batch_size=50,
-)
-...
-```
diff --git a/readme_pages/gnn_support.md b/readme_pages/gnn_support.md
index 924c2bf62af..72978883531 100644
--- a/readme_pages/gnn_support.md
+++ b/readme_pages/gnn_support.md
@@ -27,6 +27,6 @@ An overview of GNN's and how they are used is found in this excellent [blog](htt
 
 RAPIDS GNN components improve other industy GNN specific projects. Due to the degree distribution of nodes, memory bottlenecks are the pain point for large scale graphs. To solve this problem, sampling operations form the backbone for Graph Neural Networks (GNN) training. However, current sampling methods provided by other libraries are not optimized enough for the whole process of GNN training. The main limit to performance is moving data between the hosts and devices. In cuGraph, we provide an end-to-end solution from data loading to training all on the GPUs.
 
-CuGraph now supports compatibility with [Deep Graph Library](https://www.dgl.ai/) (DGL) and [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/) (PyG) by allowing conversion between a cuGraph object and a DGL or PyG object, making it possible for DGL and PyG users to access efficient data loader and graph operations (such as uniformed sampling) implementations in cuGraph, as well as keep their models unchanged in DGL or PyG. We have considerable speedup compared with the original implementation in DGL and PyG.
+CuGraph now supports compatibility with [Deep Graph Library](https://www.dgl.ai/) (DGL) and [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/) (PyG) by allowing conversion between a cuGraph object and a DGL or PyG object, making it possible for DGL and PyG users to access efficient data loader and graph operations (such as uniformed sampling) implementations in cuGraph, as well as keep their models unchanged in DGL or PyG. We have considerable speedup compared with the original implementation in DGL and PyG. The GNN packages are now developed within the [cugraph-gnn](https://github.com/rapidsai/cugraph-gnn) repository.
 
 [<img src="../img/gnn_context.png">](https://developer.nvidia.com/blog/optimizing-fraud-detection-in-financial-services-with-graph-neural-networks-and-nvidia-gpus/)